From 2900f1f9952cd901156de62e66c74743da97c8bc Mon Sep 17 00:00:00 2001 From: Joseph Henry Date: Mon, 9 Oct 2017 17:56:40 -0700 Subject: [PATCH 01/11] Windows Support --- Makefile | 19 +- examples/ztproxy/ztproxy.cpp | 31 +- examples/ztproxy/ztproxy.hpp | 5 +- .../ports/unix/include}/cc.h | 0 .../ports/unix/include}/perf.h | 0 .../ports/unix/include}/sys_arch.h | 0 .../ports/win32/include/arch/cc.h | 2 +- .../include/{lwipopts.h => lwipopts.h.bak} | 0 .../ports/win32/{pcapif.c => pcapif.c.bak} | 0 .../ports/win32/{pcapif.h => pcapif.h.bak} | 0 .../{pcapif_helper.c => pcapif_helper.c.bak} | 0 .../ports/win32/{sio.c => sio.c.bak} | 0 .../ports/win32/{test.c => test.c.bak} | 0 ext/lwip/src/api/err.c | 2 +- ext/lwip/src/api/sockets.c | 13 +- .../src/include/lwip/{errno.h => errno.h.bak} | 386 +++++++++--------- include/Debug.hpp | 10 +- include/Defs.h | 2 + include/Utilities.h | 13 +- include/libzt.h | 7 +- include/lwIP.hpp | 6 +- make-liblwip.mk | 37 +- src/Platform.cpp | 3 +- src/Utilities.cpp | 167 ++++++++ src/VirtualSocket.cpp | 3 + src/VirtualTap.cpp | 4 +- src/libzt.cpp | 6 +- src/lwIP.cpp | 11 +- 28 files changed, 486 insertions(+), 241 deletions(-) rename ext/{lwip/src/include/arch => lwip-contrib/ports/unix/include}/cc.h (100%) rename ext/{lwip/src/include/arch => lwip-contrib/ports/unix/include}/perf.h (100%) rename ext/{lwip/src/include/arch => lwip-contrib/ports/unix/include}/sys_arch.h (100%) rename ext/lwip-contrib/ports/win32/include/{lwipopts.h => lwipopts.h.bak} (100%) rename ext/lwip-contrib/ports/win32/{pcapif.c => pcapif.c.bak} (100%) rename ext/lwip-contrib/ports/win32/{pcapif.h => pcapif.h.bak} (100%) rename ext/lwip-contrib/ports/win32/{pcapif_helper.c => pcapif_helper.c.bak} (100%) rename ext/lwip-contrib/ports/win32/{sio.c => sio.c.bak} (100%) rename ext/lwip-contrib/ports/win32/{test.c => test.c.bak} (100%) rename ext/lwip/src/include/lwip/{errno.h => errno.h.bak} (97%) mode change 100755 => 100644 diff --git a/Makefile b/Makefile index 
eec7e33..75f074d 100644 --- a/Makefile +++ b/Makefile @@ -36,6 +36,18 @@ endif OSTYPE=$(shell uname -s | tr '[A-Z]' '[a-z]') BUILD=build/$(OSTYPE) +# Windows +ifeq ($(OSTYPE),mingw32_nt-6.2) +ARTOOL=ar +ARFLAGS=rcs +CC=gcc +CXX=g++ +CXXFLAGS+=-Wno-unknown-pragmas -Wno-pointer-arith -Wno-deprecated-declarations -Wno-conversion-null +WINDEFS=-lws2_32 -lshlwapi -liphlpapi -static -static-libgcc -static-libstdc++ +CONTRIBDIR=ext/lwip-contrib +LWIPARCH=$(CONTRIBDIR)/ports/win32 +LWIPARCHINCLUDE=$(LWIPARCH)/include +endif # Darwin ifeq ($(OSTYPE),darwin) ARTOOL=libtool @@ -147,7 +159,7 @@ ifeq ($(SDK_JNI), 1) LIBZT_DEFS+=-DSDK_JNI endif -CXXFLAGS=$(CFLAGS) -Wno-format -fno-rtti -std=c++11 +CXXFLAGS+=$(CFLAGS) -Wno-format -fno-rtti -std=c++11 ZT_DEFS+=-DZT_SDK -DZT_SOFTWARE_UPDATE_DEFAULT="\"disable\"" LIBZT_FILES:=src/VirtualTap.cpp src/libzt.cpp src/Utilities.cpp STATIC_LIB=$(BUILD)/libzt.a @@ -211,9 +223,8 @@ endif STACK_DRIVER_DEFS+=-DLWIP_DONT_PROVIDE_BYTEORDER_FUNCTIONS STACK_DRIVER_DEFS+=-DSTACK_LWIP STACK_DRIVER_FILES:=src/lwIP.cpp -LWIPARCH=$(CONTRIBDIR)/ports/unix LWIPDIR=ext/lwip/src -STACK_INCLUDES+=-Iext/lwip/src/include/lwip \ +STACK_INCLUDES+=$(LWIPARCHINCLUDE) -Iext/lwip/src/include/lwip \ -I$(LWIPDIR)/include \ -I$(LWIPARCH)/include \ -I$(LWIPDIR)/include/ipv4 \ @@ -349,7 +360,7 @@ nativetest: @date +"Build script finished on %F %T" ztproxy: $(CXX) $(CXXFLAGS) $(SANFLAGS) $(LIBZT_INCLUDES) $(LIBZT_DEFS) $(ZT_INCLUDES) \ - examples/ztproxy/ztproxy.cpp -o $(BUILD)/ztproxy $< -L$(BUILD) -lzt + examples/ztproxy/ztproxy.cpp -o $(BUILD)/ztproxy $< -L$(BUILD) -lzt $(WINDEFS) @./check.sh $(BUILD)/ztproxy @date +"Build script finished on %F %T" intercept: diff --git a/examples/ztproxy/ztproxy.cpp b/examples/ztproxy/ztproxy.cpp index 2556f04..d5a913c 100644 --- a/examples/ztproxy/ztproxy.cpp +++ b/examples/ztproxy/ztproxy.cpp @@ -26,12 +26,17 @@ #include #include -#include + +#if defined(__linux__) || defined(__APPLE__) + #include +#endif + +//#include "Winsock2.h" 
+ #include #include #include #include -#include #include #include @@ -39,6 +44,7 @@ #include "RingBuffer.hpp" #include "ztproxy.hpp" +#include "Utilities.h" #include "libzt.h" namespace ZeroTier { @@ -69,7 +75,11 @@ namespace ZeroTier { in4.sin_addr.s_addr = Utils::hton((uint32_t)(0x7f000001)); // listen for TCP @127.0.0.1 in4.sin_port = Utils::hton((uint16_t)proxy_listen_port); _tcpListenSocket = _phy.tcpListen((const struct sockaddr *)&in4,this); + if (!_tcpListenSocket) { + DEBUG_ERROR("Error binding on port %d for IPv4 HTTP listen socket", proxy_listen_port); + } // IPv6 + /* struct sockaddr_in6 in6; memset((void *)&in6,0,sizeof(in6)); in6.sin6_family = AF_INET6; @@ -77,13 +87,12 @@ namespace ZeroTier { in6.sin6_addr.s6_addr[15] = 1; // IPv6 localhost == ::1 in6.sin6_port = Utils::hton((uint16_t)proxy_listen_port); _tcpListenSocket6 = _phy.tcpListen((const struct sockaddr *)&in6,this); - - if (!_tcpListenSocket) { - DEBUG_ERROR("Error binding on port %d for IPv4 HTTP listen socket", proxy_listen_port); - } + */ + /* if (!_tcpListenSocket6) { DEBUG_ERROR("Error binding on port %d for IPv6 HTTP listen socket", proxy_listen_port); } + */ _thread = Thread::start(this); } @@ -117,7 +126,7 @@ namespace ZeroTier { // Moves data between client application socket and libzt VirtualSocket while(_run) { - _phy.poll(5); + _phy.poll(1); conn_m.lock(); // build fd_sets to select upon @@ -212,7 +221,6 @@ namespace ZeroTier { exit(0); } if (conn->zfd < 0) { // no connection yet - DEBUG_INFO("no connection yet, will establish..."); if (host == "") { DEBUG_ERROR("invalid hostname or address (empty)"); return; @@ -256,6 +264,7 @@ namespace ZeroTier { } if (ipv == 6) { //DEBUG_INFO("attempting to proxy [0.0.0.0:%d -> %s:%d]", _proxy_listen_port, host.c_str(), dest_port); + /* struct sockaddr_in6 in6; memset(&in6,0,sizeof(in6)); in6.sin6_family = AF_INET; @@ -265,6 +274,7 @@ namespace ZeroTier { in6.sin6_port = Utils::hton(dest_port); zfd = zts_socket(AF_INET, SOCK_STREAM, 0); err = 
zts_connect(zfd, (const struct sockaddr *)&in6, sizeof(in6)); + */ } if (zfd < 0 || err < 0) { // now release TX buffer contents we previously saved, since we can't connect @@ -287,9 +297,6 @@ namespace ZeroTier { zmap[zfd] = conn; conn_m.unlock(); } - else { - DEBUG_INFO("connection already established, reusing..."); - } // Write data coming from client TCP connection to its TX buffer, later emptied into libzt by threadMain I/O loop conn->tx_m.lock(); if ((wr = conn->TXbuf->write((const unsigned char *)data, len)) < 0) { @@ -371,7 +378,7 @@ int main(int argc, char **argv) std::string nwid = argv[3]; std::string internal_addr = argv[4]; int internal_port = atoi(argv[5]); - std::string dns_nameserver= argv[6]; + std::string dns_nameserver= "";//argv[6]; ZeroTier::ZTProxy *proxy = new ZeroTier::ZTProxy(proxy_listen_port, nwid, path, internal_addr, internal_port, dns_nameserver); diff --git a/examples/ztproxy/ztproxy.hpp b/examples/ztproxy/ztproxy.hpp index 304ec53..91f49ab 100644 --- a/examples/ztproxy/ztproxy.hpp +++ b/examples/ztproxy/ztproxy.hpp @@ -34,10 +34,13 @@ #include "Phy.hpp" #include "OSUtils.hpp" +#if defined(__linux__) || defined(__APPLE__) + #include +#endif + #include #include #include -#include #define BUF_SZ 1024*1024 diff --git a/ext/lwip/src/include/arch/cc.h b/ext/lwip-contrib/ports/unix/include/cc.h similarity index 100% rename from ext/lwip/src/include/arch/cc.h rename to ext/lwip-contrib/ports/unix/include/cc.h diff --git a/ext/lwip/src/include/arch/perf.h b/ext/lwip-contrib/ports/unix/include/perf.h similarity index 100% rename from ext/lwip/src/include/arch/perf.h rename to ext/lwip-contrib/ports/unix/include/perf.h diff --git a/ext/lwip/src/include/arch/sys_arch.h b/ext/lwip-contrib/ports/unix/include/sys_arch.h similarity index 100% rename from ext/lwip/src/include/arch/sys_arch.h rename to ext/lwip-contrib/ports/unix/include/sys_arch.h diff --git a/ext/lwip-contrib/ports/win32/include/arch/cc.h 
b/ext/lwip-contrib/ports/win32/include/arch/cc.h index a1a2a70..56e6ebc 100644 --- a/ext/lwip-contrib/ports/win32/include/arch/cc.h +++ b/ext/lwip-contrib/ports/win32/include/arch/cc.h @@ -39,7 +39,7 @@ #pragma warning (disable: 4820) /* 'x' bytes padding added after data member 'y' */ #endif -#define LWIP_PROVIDE_ERRNO +#define LWIP_PROVIDE_ERRNO 0 /* Define platform endianness (might already be defined) */ #ifndef BYTE_ORDER diff --git a/ext/lwip-contrib/ports/win32/include/lwipopts.h b/ext/lwip-contrib/ports/win32/include/lwipopts.h.bak similarity index 100% rename from ext/lwip-contrib/ports/win32/include/lwipopts.h rename to ext/lwip-contrib/ports/win32/include/lwipopts.h.bak diff --git a/ext/lwip-contrib/ports/win32/pcapif.c b/ext/lwip-contrib/ports/win32/pcapif.c.bak similarity index 100% rename from ext/lwip-contrib/ports/win32/pcapif.c rename to ext/lwip-contrib/ports/win32/pcapif.c.bak diff --git a/ext/lwip-contrib/ports/win32/pcapif.h b/ext/lwip-contrib/ports/win32/pcapif.h.bak similarity index 100% rename from ext/lwip-contrib/ports/win32/pcapif.h rename to ext/lwip-contrib/ports/win32/pcapif.h.bak diff --git a/ext/lwip-contrib/ports/win32/pcapif_helper.c b/ext/lwip-contrib/ports/win32/pcapif_helper.c.bak similarity index 100% rename from ext/lwip-contrib/ports/win32/pcapif_helper.c rename to ext/lwip-contrib/ports/win32/pcapif_helper.c.bak diff --git a/ext/lwip-contrib/ports/win32/sio.c b/ext/lwip-contrib/ports/win32/sio.c.bak similarity index 100% rename from ext/lwip-contrib/ports/win32/sio.c rename to ext/lwip-contrib/ports/win32/sio.c.bak diff --git a/ext/lwip-contrib/ports/win32/test.c b/ext/lwip-contrib/ports/win32/test.c.bak similarity index 100% rename from ext/lwip-contrib/ports/win32/test.c rename to ext/lwip-contrib/ports/win32/test.c.bak diff --git a/ext/lwip/src/api/err.c b/ext/lwip/src/api/err.c index 6e9ab76..5c795a4 100755 --- a/ext/lwip/src/api/err.c +++ b/ext/lwip/src/api/err.c @@ -40,7 +40,7 @@ #include "lwip/def.h" #include 
"lwip/sys.h" -#include "lwip/errno.h" +#include "errno.h" #if !NO_SYS /** Table to quickly map an lwIP error (err_t) to a socket error diff --git a/ext/lwip/src/api/sockets.c b/ext/lwip/src/api/sockets.c index d72724f..387e035 100755 --- a/ext/lwip/src/api/sockets.c +++ b/ext/lwip/src/api/sockets.c @@ -658,10 +658,19 @@ lwip_connect(int s, const struct sockaddr *name, socklen_t namelen) } LWIP_UNUSED_ARG(namelen); - if (name->sa_family == AF_UNSPEC) { + + if ( +#ifdef __MINGW32__ + false +#else + name->sa_family == AF_UNSPEC +#endif + ) + { LWIP_DEBUGF(SOCKETS_DEBUG, ("lwip_connect(%d, AF_UNSPEC)\n", s)); err = netconn_disconnect(sock->conn); - } else { + } + else { ip_addr_t remote_addr; u16_t remote_port; diff --git a/ext/lwip/src/include/lwip/errno.h b/ext/lwip/src/include/lwip/errno.h.bak old mode 100755 new mode 100644 similarity index 97% rename from ext/lwip/src/include/lwip/errno.h rename to ext/lwip/src/include/lwip/errno.h.bak index 47a4ff2..1c770df --- a/ext/lwip/src/include/lwip/errno.h +++ b/ext/lwip/src/include/lwip/errno.h.bak @@ -1,193 +1,193 @@ -/** - * @file - * Posix Errno defines - */ - -/* - * Copyright (c) 2001-2004 Swedish Institute of Computer Science. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * 3. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT - * SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT - * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING - * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY - * OF SUCH DAMAGE. - * - * This file is part of the lwIP TCP/IP stack. - * - * Author: Adam Dunkels - * - */ -#ifndef LWIP_HDR_ERRNO_H -#define LWIP_HDR_ERRNO_H - -#include "lwip/opt.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef LWIP_PROVIDE_ERRNO - -#define EPERM 1 /* Operation not permitted */ -#define ENOENT 2 /* No such file or directory */ -#define ESRCH 3 /* No such process */ -#define EINTR 4 /* Interrupted system call */ -#define EIO 5 /* I/O error */ -#define ENXIO 6 /* No such device or address */ -#define E2BIG 7 /* Arg list too long */ -#define ENOEXEC 8 /* Exec format error */ -#define EBADF 9 /* Bad file number */ -#define ECHILD 10 /* No child processes */ -#define EAGAIN 11 /* Try again */ -#define ENOMEM 12 /* Out of memory */ -#define EACCES 13 /* Permission denied */ -#define EFAULT 14 /* Bad address */ -#define ENOTBLK 15 /* Block device required */ -#define EBUSY 16 /* Device or resource busy */ -#define EEXIST 17 /* File exists */ -#define EXDEV 18 /* Cross-device link */ -#define ENODEV 19 /* No such device */ -#define ENOTDIR 20 /* Not a directory */ -#define EISDIR 21 /* Is a directory */ -#define EINVAL 22 /* Invalid argument */ -#define ENFILE 23 /* File table overflow */ -#define EMFILE 24 /* Too many open files 
*/ -#define ENOTTY 25 /* Not a typewriter */ -#define ETXTBSY 26 /* Text file busy */ -#define EFBIG 27 /* File too large */ -#define ENOSPC 28 /* No space left on device */ -#define ESPIPE 29 /* Illegal seek */ -#define EROFS 30 /* Read-only file system */ -#define EMLINK 31 /* Too many links */ -#define EPIPE 32 /* Broken pipe */ -#define EDOM 33 /* Math argument out of domain of func */ -#define ERANGE 34 /* Math result not representable */ -#define EDEADLK 35 /* Resource deadlock would occur */ -#define ENAMETOOLONG 36 /* File name too long */ -#define ENOLCK 37 /* No record locks available */ -#define ENOSYS 38 /* Function not implemented */ -#define ENOTEMPTY 39 /* Directory not empty */ -#define ELOOP 40 /* Too many symbolic links encountered */ -#define EWOULDBLOCK EAGAIN /* Operation would block */ -#define ENOMSG 42 /* No message of desired type */ -#define EIDRM 43 /* Identifier removed */ -#define ECHRNG 44 /* Channel number out of range */ -#define EL2NSYNC 45 /* Level 2 not synchronized */ -#define EL3HLT 46 /* Level 3 halted */ -#define EL3RST 47 /* Level 3 reset */ -#define ELNRNG 48 /* Link number out of range */ -#define EUNATCH 49 /* Protocol driver not attached */ -#define ENOCSI 50 /* No CSI structure available */ -#define EL2HLT 51 /* Level 2 halted */ -#define EBADE 52 /* Invalid exchange */ -#define EBADR 53 /* Invalid request descriptor */ -#define EXFULL 54 /* Exchange full */ -#define ENOANO 55 /* No anode */ -#define EBADRQC 56 /* Invalid request code */ -#define EBADSLT 57 /* Invalid slot */ - -#define EDEADLOCK EDEADLK - -#define EBFONT 59 /* Bad font file format */ -#define ENOSTR 60 /* Device not a stream */ -#define ENODATA 61 /* No data available */ -#define ETIME 62 /* Timer expired */ -#define ENOSR 63 /* Out of streams resources */ -#define ENONET 64 /* Machine is not on the network */ -#define ENOPKG 65 /* Package not installed */ -#define EREMOTE 66 /* Object is remote */ -#define ENOLINK 67 /* Link has been severed */ 
-#define EADV 68 /* Advertise error */ -#define ESRMNT 69 /* Srmount error */ -#define ECOMM 70 /* Communication error on send */ -#define EPROTO 71 /* Protocol error */ -#define EMULTIHOP 72 /* Multihop attempted */ -#define EDOTDOT 73 /* RFS specific error */ -#define EBADMSG 74 /* Not a data message */ -#define EOVERFLOW 75 /* Value too large for defined data type */ -#define ENOTUNIQ 76 /* Name not unique on network */ -#define EBADFD 77 /* File descriptor in bad state */ -#define EREMCHG 78 /* Remote address changed */ -#define ELIBACC 79 /* Can not access a needed shared library */ -#define ELIBBAD 80 /* Accessing a corrupted shared library */ -#define ELIBSCN 81 /* .lib section in a.out corrupted */ -#define ELIBMAX 82 /* Attempting to link in too many shared libraries */ -#define ELIBEXEC 83 /* Cannot exec a shared library directly */ -#define EILSEQ 84 /* Illegal byte sequence */ -#define ERESTART 85 /* Interrupted system call should be restarted */ -#define ESTRPIPE 86 /* Streams pipe error */ -#define EUSERS 87 /* Too many users */ -#define ENOTSOCK 88 /* Socket operation on non-socket */ -#define EDESTADDRREQ 89 /* Destination address required */ -#define EMSGSIZE 90 /* Message too long */ -#define EPROTOTYPE 91 /* Protocol wrong type for socket */ -#define ENOPROTOOPT 92 /* Protocol not available */ -#define EPROTONOSUPPORT 93 /* Protocol not supported */ -#define ESOCKTNOSUPPORT 94 /* Socket type not supported */ -#define EOPNOTSUPP 95 /* Operation not supported on transport endpoint */ -#define EPFNOSUPPORT 96 /* Protocol family not supported */ -#define EAFNOSUPPORT 97 /* Address family not supported by protocol */ -#define EADDRINUSE 98 /* Address already in use */ -#define EADDRNOTAVAIL 99 /* Cannot assign requested address */ -#define ENETDOWN 100 /* Network is down */ -#define ENETUNREACH 101 /* Network is unreachable */ -#define ENETRESET 102 /* Network dropped connection because of reset */ -#define ECONNABORTED 103 /* Software caused 
connection abort */ -#define ECONNRESET 104 /* Connection reset by peer */ -#define ENOBUFS 105 /* No buffer space available */ -#define EISCONN 106 /* Transport endpoint is already connected */ -#define ENOTCONN 107 /* Transport endpoint is not connected */ -#define ESHUTDOWN 108 /* Cannot send after transport endpoint shutdown */ -#define ETOOMANYREFS 109 /* Too many references: cannot splice */ -#define ETIMEDOUT 110 /* Connection timed out */ -#define ECONNREFUSED 111 /* Connection refused */ -#define EHOSTDOWN 112 /* Host is down */ -#define EHOSTUNREACH 113 /* No route to host */ -#define EALREADY 114 /* Operation already in progress */ -#define EINPROGRESS 115 /* Operation now in progress */ -#define ESTALE 116 /* Stale NFS file handle */ -#define EUCLEAN 117 /* Structure needs cleaning */ -#define ENOTNAM 118 /* Not a XENIX named type file */ -#define ENAVAIL 119 /* No XENIX semaphores available */ -#define EISNAM 120 /* Is a named type file */ -#define EREMOTEIO 121 /* Remote I/O error */ -#define EDQUOT 122 /* Quota exceeded */ - -#define ENOMEDIUM 123 /* No medium found */ -#define EMEDIUMTYPE 124 /* Wrong medium type */ - -#ifndef errno -extern int errno; -#endif - -#else /* LWIP_PROVIDE_ERRNO */ - -/* Define LWIP_ERRNO_INCLUDE to to include the error defines here */ -#ifdef LWIP_ERRNO_INCLUDE -//#include -#endif /* LWIP_ERRNO_INCLUDE */ - -#endif /* LWIP_PROVIDE_ERRNO */ - -#ifdef __cplusplus -} -#endif - -#endif /* LWIP_HDR_ERRNO_H */ +/** + * @file + * Posix Errno defines + */ + +/* + * Copyright (c) 2001-2004 Swedish Institute of Computer Science. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT + * SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY + * OF SUCH DAMAGE. + * + * This file is part of the lwIP TCP/IP stack. 
+ * + * Author: Adam Dunkels + * + */ +#ifndef LWIP_HDR_ERRNO_H +#define LWIP_HDR_ERRNO_H + +#include "lwip/opt.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef LWIP_PROVIDE_ERRNO + +#define EPERM 1 /* Operation not permitted */ +#define ENOENT 2 /* No such file or directory */ +#define ESRCH 3 /* No such process */ +#define EINTR 4 /* Interrupted system call */ +#define EIO 5 /* I/O error */ +#define ENXIO 6 /* No such device or address */ +#define E2BIG 7 /* Arg list too long */ +#define ENOEXEC 8 /* Exec format error */ +#define EBADF 9 /* Bad file number */ +#define ECHILD 10 /* No child processes */ +#define EAGAIN 11 /* Try again */ +#define ENOMEM 12 /* Out of memory */ +#define EACCES 13 /* Permission denied */ +#define EFAULT 14 /* Bad address */ +#define ENOTBLK 15 /* Block device required */ +#define EBUSY 16 /* Device or resource busy */ +#define EEXIST 17 /* File exists */ +#define EXDEV 18 /* Cross-device link */ +#define ENODEV 19 /* No such device */ +#define ENOTDIR 20 /* Not a directory */ +#define EISDIR 21 /* Is a directory */ +#define EINVAL 22 /* Invalid argument */ +#define ENFILE 23 /* File table overflow */ +#define EMFILE 24 /* Too many open files */ +#define ENOTTY 25 /* Not a typewriter */ +#define ETXTBSY 26 /* Text file busy */ +#define EFBIG 27 /* File too large */ +#define ENOSPC 28 /* No space left on device */ +#define ESPIPE 29 /* Illegal seek */ +#define EROFS 30 /* Read-only file system */ +#define EMLINK 31 /* Too many links */ +#define EPIPE 32 /* Broken pipe */ +#define EDOM 33 /* Math argument out of domain of func */ +#define ERANGE 34 /* Math result not representable */ +#define EDEADLK 35 /* Resource deadlock would occur */ +#define ENAMETOOLONG 36 /* File name too long */ +#define ENOLCK 37 /* No record locks available */ +#define ENOSYS 38 /* Function not implemented */ +#define ENOTEMPTY 39 /* Directory not empty */ +#define ELOOP 40 /* Too many symbolic links encountered */ +#define EWOULDBLOCK EAGAIN /* 
Operation would block */ +#define ENOMSG 42 /* No message of desired type */ +#define EIDRM 43 /* Identifier removed */ +#define ECHRNG 44 /* Channel number out of range */ +#define EL2NSYNC 45 /* Level 2 not synchronized */ +#define EL3HLT 46 /* Level 3 halted */ +#define EL3RST 47 /* Level 3 reset */ +#define ELNRNG 48 /* Link number out of range */ +#define EUNATCH 49 /* Protocol driver not attached */ +#define ENOCSI 50 /* No CSI structure available */ +#define EL2HLT 51 /* Level 2 halted */ +#define EBADE 52 /* Invalid exchange */ +#define EBADR 53 /* Invalid request descriptor */ +#define EXFULL 54 /* Exchange full */ +#define ENOANO 55 /* No anode */ +#define EBADRQC 56 /* Invalid request code */ +#define EBADSLT 57 /* Invalid slot */ + +#define EDEADLOCK EDEADLK + +#define EBFONT 59 /* Bad font file format */ +#define ENOSTR 60 /* Device not a stream */ +#define ENODATA 61 /* No data available */ +#define ETIME 62 /* Timer expired */ +#define ENOSR 63 /* Out of streams resources */ +#define ENONET 64 /* Machine is not on the network */ +#define ENOPKG 65 /* Package not installed */ +#define EREMOTE 66 /* Object is remote */ +#define ENOLINK 67 /* Link has been severed */ +#define EADV 68 /* Advertise error */ +#define ESRMNT 69 /* Srmount error */ +#define ECOMM 70 /* Communication error on send */ +#define EPROTO 71 /* Protocol error */ +#define EMULTIHOP 72 /* Multihop attempted */ +#define EDOTDOT 73 /* RFS specific error */ +#define EBADMSG 74 /* Not a data message */ +#define EOVERFLOW 75 /* Value too large for defined data type */ +#define ENOTUNIQ 76 /* Name not unique on network */ +#define EBADFD 77 /* File descriptor in bad state */ +#define EREMCHG 78 /* Remote address changed */ +#define ELIBACC 79 /* Can not access a needed shared library */ +#define ELIBBAD 80 /* Accessing a corrupted shared library */ +#define ELIBSCN 81 /* .lib section in a.out corrupted */ +#define ELIBMAX 82 /* Attempting to link in too many shared libraries */ +#define 
ELIBEXEC 83 /* Cannot exec a shared library directly */ +#define EILSEQ 84 /* Illegal byte sequence */ +#define ERESTART 85 /* Interrupted system call should be restarted */ +#define ESTRPIPE 86 /* Streams pipe error */ +#define EUSERS 87 /* Too many users */ +#define ENOTSOCK 88 /* Socket operation on non-socket */ +#define EDESTADDRREQ 89 /* Destination address required */ +#define EMSGSIZE 90 /* Message too long */ +#define EPROTOTYPE 91 /* Protocol wrong type for socket */ +#define ENOPROTOOPT 92 /* Protocol not available */ +#define EPROTONOSUPPORT 93 /* Protocol not supported */ +#define ESOCKTNOSUPPORT 94 /* Socket type not supported */ +#define EOPNOTSUPP 95 /* Operation not supported on transport endpoint */ +#define EPFNOSUPPORT 96 /* Protocol family not supported */ +#define EAFNOSUPPORT 97 /* Address family not supported by protocol */ +#define EADDRINUSE 98 /* Address already in use */ +#define EADDRNOTAVAIL 99 /* Cannot assign requested address */ +#define ENETDOWN 100 /* Network is down */ +#define ENETUNREACH 101 /* Network is unreachable */ +#define ENETRESET 102 /* Network dropped connection because of reset */ +#define ECONNABORTED 103 /* Software caused connection abort */ +#define ECONNRESET 104 /* Connection reset by peer */ +#define ENOBUFS 105 /* No buffer space available */ +#define EISCONN 106 /* Transport endpoint is already connected */ +#define ENOTCONN 107 /* Transport endpoint is not connected */ +#define ESHUTDOWN 108 /* Cannot send after transport endpoint shutdown */ +#define ETOOMANYREFS 109 /* Too many references: cannot splice */ +#define ETIMEDOUT 110 /* Connection timed out */ +#define ECONNREFUSED 111 /* Connection refused */ +#define EHOSTDOWN 112 /* Host is down */ +#define EHOSTUNREACH 113 /* No route to host */ +#define EALREADY 114 /* Operation already in progress */ +#define EINPROGRESS 115 /* Operation now in progress */ +#define ESTALE 116 /* Stale NFS file handle */ +#define EUCLEAN 117 /* Structure needs cleaning */ 
+#define ENOTNAM 118 /* Not a XENIX named type file */ +#define ENAVAIL 119 /* No XENIX semaphores available */ +#define EISNAM 120 /* Is a named type file */ +#define EREMOTEIO 121 /* Remote I/O error */ +#define EDQUOT 122 /* Quota exceeded */ + +#define ENOMEDIUM 123 /* No medium found */ +#define EMEDIUMTYPE 124 /* Wrong medium type */ + +#ifndef errno +extern int errno; +#endif + +#else /* LWIP_PROVIDE_ERRNO */ + +/* Define LWIP_ERRNO_INCLUDE to to include the error defines here */ +#ifdef LWIP_ERRNO_INCLUDE +//#include +#endif /* LWIP_ERRNO_INCLUDE */ + +#endif /* LWIP_PROVIDE_ERRNO */ + +#ifdef __cplusplus +} +#endif + +#endif /* LWIP_HDR_ERRNO_H */ diff --git a/include/Debug.hpp b/include/Debug.hpp index b306d1e..d807cbb 100644 --- a/include/Debug.hpp +++ b/include/Debug.hpp @@ -34,11 +34,14 @@ #define LIBZT_DEBUG_HPP #include -#include #include #include #include +#if defined(__linux__) +#include +#endif + #include "Platform.h" #define ZT_MSG_ERROR true // Errors @@ -53,7 +56,7 @@ #if defined(__APPLE__) #include "TargetConditionals.h" #endif -#if defined(ZT_COLOR) && !defined(__ANDROID__) && !defined(TARGET_OS_IPHONE) && !defined(TARGET_IPHONE_SIMULATOR) && !defined(__APP_FRAMEWORK__) +#if defined(ZT_COLOR) && !defined(__MINGW32__) && !defined(__ANDROID__) && !defined(TARGET_OS_IPHONE) && !defined(TARGET_IPHONE_SIMULATOR) && !defined(__APP_FRAMEWORK__) #define ZT_RED "\x1B[31m" #define ZT_GRN "\x1B[32m" #define ZT_YEL "\x1B[33m" @@ -88,6 +91,9 @@ extern unsigned int gettid(); // defined in libzt.cpp #elif __APPLE__ #define ZT_THREAD_ID (long)0//(long)gettid() #endif +#ifdef __MINGW32__ + #define ZT_THREAD_ID (long)0 +#endif #if defined(__JNI_LIB__) #include diff --git a/include/Defs.h b/include/Defs.h index 02a7936..8c2776c 100644 --- a/include/Defs.h +++ b/include/Defs.h @@ -48,7 +48,9 @@ */ #define ZTO_ID_LEN 16 +#if !defined(__MINGW32__) typedef uint32_t socklen_t; +#endif /****************************************************************************/ /* 
For SOCK_RAW support, it will initially be modeled after linux's API, so */ diff --git a/include/Utilities.h b/include/Utilities.h index 625c6db..ae1b6a1 100644 --- a/include/Utilities.h +++ b/include/Utilities.h @@ -35,6 +35,17 @@ #include "InetAddress.hpp" +#if defined(__MINGW32__) + +#define NS_INADDRSZ 4 +#define NS_IN6ADDRSZ 16 +#define NS_INT16SZ 2 + +int inet_pton4(const char *src, void *dst); +int inet_pton6(const char *src, void *dst); +int inet_pton(int af, const char *src, void *dst); +#endif + /** * @brief Returns masked address for subnet comparisons * @@ -62,7 +73,7 @@ char *beautify_eth_proto_nums(int proto); * @param inet * @return */ -void sockaddr2inet(int socket_family, const struct sockaddr *addr, ZeroTier::InetAddress *inet); +//void sockaddr2inet(int socket_family, const struct sockaddr *addr, ZeroTier::InetAddress *inet); /** * @brief Convert a raw MAC address byte array into a human-readable string diff --git a/include/libzt.h b/include/libzt.h index 714a603..5fa678b 100644 --- a/include/libzt.h +++ b/include/libzt.h @@ -33,11 +33,14 @@ #ifndef LIBZT_H #define LIBZT_H -#include #include #include #include +#if defined(__linux__) + #include +#endif + #include "Debug.hpp" #include "Defs.h" @@ -440,7 +443,9 @@ int zts_close(int fd); * @param timeout * @return */ +#if defined(__linux__) int zts_poll(struct pollfd *fds, nfds_t nfds, int timeout); +#endif /** * @brief Monitor multiple file descriptors, waiting until one or more of the file descriptors become "ready" diff --git a/include/lwIP.hpp b/include/lwIP.hpp index 236d1df..1c05742 100644 --- a/include/lwIP.hpp +++ b/include/lwIP.hpp @@ -60,12 +60,12 @@ void lwip_dns_init(); * @usage lwip_driver_init() * @return */ -void lwip_start_dhcp(struct netif *interface); +void lwip_start_dhcp(void *netif); -void general_lwip_init_interface(void *tapref, struct netif *interface, const char *name, const ZeroTier::MAC &mac, +void general_lwip_init_interface(void *tapref, void *netif, const char *name, 
const ZeroTier::MAC &mac, const ZeroTier::InetAddress &addr, const ZeroTier::InetAddress &nm, const ZeroTier::InetAddress &gw); -void general_turn_on_interface(struct netif *interface); +void general_turn_on_interface(void *netif); /** * @brief Set up an interface in the network stack for the VirtualTap. diff --git a/make-liblwip.mk b/make-liblwip.mk index 04638af..3c738af 100644 --- a/make-liblwip.mk +++ b/make-liblwip.mk @@ -32,21 +32,38 @@ # CONTRIBDIR=ext/lwip-contrib -LWIPARCH=$(CONTRIBDIR)/ports/unix - -#Set this to where you have the lwip core module checked out from CVS -#default assumes it's a dir named lwip at the same level as the contrib module LWIPDIR=ext/lwip/src - CCDEP=clang++ + # Automagically pick clang or gcc, with preference for clang # This is only done if we have not overridden these with an environment or CLI variable -ifeq ($(origin CCX),default) - CCX=$(shell if [ -e /usr/bin/clang++ ]; then echo clang++; else echo g++; fi) +ifeq ($(origin CXX),default) + CXX=$(shell if [ -e /usr/bin/clang++ ]; then echo clang++; else echo g++; fi) endif -LWIPINCLUDES:=-I$(LWIPDIR)/include -I$(LWIPARCH) -I$(LWIPDIR) -I. -Iext -Iinclude -CFLAGS=-Wno-format -Wno-deprecated -O3 -g -Wall -fPIC +OSTYPE=$(shell uname -s | tr '[A-Z]' '[a-z]') +BUILD=build/$(OSTYPE) + +CCX=clang++ + +# Windows +ifeq ($(OSTYPE),mingw32_nt-6.2) +CCX=g++ +WINDEFS=-Wno-c++11-compat -std=c++98 +LWIPARCH=$(CONTRIBDIR)/ports/win32 +endif +ifeq ($(OSTYPE),linux) +LWIPARCH=$(CONTRIBDIR)/ports/unix +endif +ifeq ($(OSTYPE),darwin) +LWIPARCH=$(CONTRIBDIR)/ports/unix +endif +ifeq ($(OSTYPE),freebsd) +LWIPARCH=$(CONTRIBDIR)/ports/unix +endif + +LWIPINCLUDES:=-I$(LWIPDIR)/include -I$(LWIPARCH) -I$(LWIPARCH)/include -I$(LWIPDIR) -I. 
-Iext -Iinclude +CFLAGS= $(WINDEFS) -Wno-format -Wno-deprecated -O3 -g -Wall -fPIC CFLAGS+=-DLWIP_IPV4 -DLWIP_IPV6=0 -DIPv4 -DLWIP_DEBUG=1 $(LWIPINCLUDES) UNIXLIB=liblwip.a @@ -63,8 +80,6 @@ LWIPNOAPPSFILES+=$(ARCHFILES) LWIPNOAPPSFILESW=$(wildcard $(LWIPNOAPPSFILES)) LWIPNOAPPSOBJS=$(notdir $(LWIPNOAPPSFILESW:.c=.o)) -CCX=clang++ - %.o: $(CCX) $(CFLAGS) -c $(<:.o=.c) diff --git a/src/Platform.cpp b/src/Platform.cpp index 51b2b4f..5cbc1ae 100644 --- a/src/Platform.cpp +++ b/src/Platform.cpp @@ -53,7 +53,8 @@ void handle_general_failure() { inline unsigned int gettid() { #ifdef _WIN32 - return GetCurrentThreadId(); + //return GetCurrentThreadId(); + return 0; #elif defined(__linux__) return static_cast(syscall(__NR_gettid)); #elif defined(__APPLE__) diff --git a/src/Utilities.cpp b/src/Utilities.cpp index 7c1d1bb..d0010e7 100644 --- a/src/Utilities.cpp +++ b/src/Utilities.cpp @@ -32,6 +32,171 @@ #include "InetAddress.hpp" #include "Debug.hpp" +#include "Utilities.h" + +#if defined(__MINGW32__) + +int inet_pton4(const char *src, void *dst) +{ + uint8_t tmp[NS_INADDRSZ], *tp; + + int saw_digit = 0; + int octets = 0; + *(tp = tmp) = 0; + + int ch; + while ((ch = *src++) != '\0') + { + if (ch >= '0' && ch <= '9') + { + uint32_t n = *tp * 10 + (ch - '0'); + + if (saw_digit && *tp == 0) + return 0; + + if (n > 255) + return 0; + + *tp = n; + if (!saw_digit) + { + if (++octets > 4) + return 0; + saw_digit = 1; + } + } + else if (ch == '.' && saw_digit) + { + if (octets == 4) + return 0; + *++tp = 0; + saw_digit = 0; + } + else + return 0; + } + if (octets < 4) + return 0; + + memcpy(dst, tmp, NS_INADDRSZ); + + return 1; +} + +int inet_pton6(const char *src, void *dst) +{ + static const char xdigits[] = "0123456789abcdef"; + uint8_t tmp[NS_IN6ADDRSZ]; + + uint8_t *tp = (uint8_t*) memset(tmp, '\0', NS_IN6ADDRSZ); + uint8_t *endp = tp + NS_IN6ADDRSZ; + uint8_t *colonp = NULL; + + /* Leading :: requires some special handling. 
*/ + if (*src == ':') + { + if (*++src != ':') + return 0; + } + + const char *curtok = src; + int saw_xdigit = 0; + uint32_t val = 0; + int ch; + while ((ch = tolower(*src++)) != '\0') + { + const char *pch = strchr(xdigits, ch); + if (pch != NULL) + { + val <<= 4; + val |= (pch - xdigits); + if (val > 0xffff) + return 0; + saw_xdigit = 1; + continue; + } + if (ch == ':') + { + curtok = src; + if (!saw_xdigit) + { + if (colonp) + return 0; + colonp = tp; + continue; + } + else if (*src == '\0') + { + return 0; + } + if (tp + NS_INT16SZ > endp) + return 0; + *tp++ = (uint8_t) (val >> 8) & 0xff; + *tp++ = (uint8_t) val & 0xff; + saw_xdigit = 0; + val = 0; + continue; + } + if (ch == '.' && ((tp + NS_INADDRSZ) <= endp) && + inet_pton4(curtok, (char*) tp) > 0) + { + tp += NS_INADDRSZ; + saw_xdigit = 0; + break; /* '\0' was seen by inet_pton4(). */ + } + return 0; + } + if (saw_xdigit) + { + if (tp + NS_INT16SZ > endp) + return 0; + *tp++ = (uint8_t) (val >> 8) & 0xff; + *tp++ = (uint8_t) val & 0xff; + } + if (colonp != NULL) + { + /* + * Since some memmove()'s erroneously fail to handle + * overlapping regions, we'll do the shift by hand. 
+ */ + const int n = tp - colonp; + + if (tp == endp) + return 0; + + for (int i = 1; i <= n; i++) + { + endp[-i] = colonp[n - i]; + colonp[n - i] = 0; + } + tp = endp; + } + if (tp != endp) + return 0; + + memcpy(dst, tmp, NS_IN6ADDRSZ); + + return 1; +} + +int inet_pton(int af, const char *src, void *dst) +{ + switch (af) + { + case AF_INET: + return inet_pton4(src, dst); + case AF_INET6: + return inet_pton6(src, dst); + default: + return -1; + } +} + +#endif + + + + char *beautify_eth_proto_nums(int proto) { @@ -120,6 +285,7 @@ bool ipv6_in_subnet(ZeroTier::InetAddress *subnet, ZeroTier::InetAddress *addr) return !strcmp(r.toIpString(b0), b.toIpString(b1)); } +/* void sockaddr2inet(int socket_family, const struct sockaddr *addr, ZeroTier::InetAddress *inet) { char ipstr[INET6_ADDRSTRLEN]; @@ -137,6 +303,7 @@ void sockaddr2inet(int socket_family, const struct sockaddr *addr, ZeroTier::Ine inet->fromString(addrstr); } } +*/ void mac2str(char *macbuf, int len, unsigned char* addr) { diff --git a/src/VirtualSocket.cpp b/src/VirtualSocket.cpp index 9bb0edc..6cc872a 100644 --- a/src/VirtualSocket.cpp +++ b/src/VirtualSocket.cpp @@ -34,7 +34,10 @@ #define ZT_VIRTUALSOCKET_HPP #include + +#if defined(__linux__) || #defined(__APPLE__) #include +#endif #include "VirtualSocket.h" #include "VirtualBindingPair.h" diff --git a/src/VirtualTap.cpp b/src/VirtualTap.cpp index 85cb9b0..c33a0c0 100644 --- a/src/VirtualTap.cpp +++ b/src/VirtualTap.cpp @@ -421,7 +421,7 @@ namespace ZeroTier { target_addr = managed_routes->at(i).target; via_addr = managed_routes->at(i).via; nm = target_addr.netmask(); - for (int j=0; jat(j).target; diff --git a/src/libzt.cpp b/src/libzt.cpp index 662ba2b..2775851 100644 --- a/src/libzt.cpp +++ b/src/libzt.cpp @@ -66,7 +66,7 @@ void sys2lwip(int fd, const struct sockaddr *orig, struct sockaddr *modified) } #if defined(LIBZT_IPV4) if (ss.ss_family == AF_INET) { -#if defined(__linux__) +#if defined(__linux__) || defined(__MINGW32__) struct sockaddr_in 
*p4 = (struct sockaddr_in *)modified; struct sockaddr_in *addr4 = (struct sockaddr_in*)orig; p4->sin_len = sizeof(struct sockaddr_in); @@ -80,7 +80,7 @@ void sys2lwip(int fd, const struct sockaddr *orig, struct sockaddr *modified) #if defined(LIBZT_IPV6) if (ss.ss_family == AF_INET6) { -#if defined(__linux__) +#if defined(__linux__) || defined(__MINGW32__) struct sockaddr_in6 *p6 = (struct sockaddr_in6 *)modified; struct sockaddr_in6 *addr6 = (struct sockaddr_in6*)orig; p6->sin6_len = sizeof(struct sockaddr_in6); @@ -311,6 +311,7 @@ int zts_close(int fd) return err; } +#if defined(__linux__) int zts_poll(struct pollfd *fds, nfds_t nfds, int timeout) { int err = -1; @@ -324,6 +325,7 @@ int zts_poll(struct pollfd *fds, nfds_t nfds, int timeout) #endif return err; } +#endif int zts_select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeout) diff --git a/src/lwIP.cpp b/src/lwIP.cpp index 429335c..0e3c309 100644 --- a/src/lwIP.cpp +++ b/src/lwIP.cpp @@ -115,6 +115,9 @@ void lwip_driver_init() if (lwip_driver_initialized == true) { return; } +#if defined(__MINGW32__) + sys_init(); // required for win32 initialization of critical sections +#endif sys_thread_new("main_network_stack_thread", main_network_stack_thread, NULL, DEFAULT_THREAD_STACKSIZE, DEFAULT_THREAD_PRIO); } @@ -160,7 +163,7 @@ err_t lwip_eth_tx(struct netif *netif, struct pbuf *p) return ERR_OK; } -void general_lwip_init_interface(void *tapref, struct netif *interface, const char *name, const ZeroTier::MAC &mac, const ZeroTier::InetAddress &addr, const ZeroTier::InetAddress &nm, const ZeroTier::InetAddress &gw) +void general_lwip_init_interface(void *tapref, void *netif, const char *name, const ZeroTier::MAC &mac, const ZeroTier::InetAddress &addr, const ZeroTier::InetAddress &nm, const ZeroTier::InetAddress &gw) { #if defined(LIBZT_IPV4) char ipbuf[INET6_ADDRSTRLEN], nmbuf[INET6_ADDRSTRLEN], gwbuf[INET6_ADDRSTRLEN]; @@ -192,7 +195,7 @@ void general_lwip_init_interface(void
*tapref, struct netif *interface, const ch #endif } -void general_turn_on_interface(struct netif *interface) +void general_turn_on_interface(void *netif) { //netif_set_up(&n1); //netif_set_default(&n1); @@ -206,9 +209,9 @@ void lwip_dns_init() dns_init(); } -void lwip_start_dhcp(struct netif *interface) +void lwip_start_dhcp(void *netif) { - netifapi_dhcp_start(interface); + netifapi_dhcp_start((struct netif *)netif); } void lwip_init_interface(void *tapref, const ZeroTier::MAC &mac, const ZeroTier::InetAddress &ip) From 29e31dd60a608bdd9e1c735e395bb19d2f3f274b Mon Sep 17 00:00:00 2001 From: Joseph Henry Date: Tue, 10 Oct 2017 11:40:14 -0700 Subject: [PATCH 02/11] Fixes for Unix-like builds after introduction of Windows code --- Makefile | 11 +++++++---- .../ports/unix/include/{ => arch}/cc.h | 0 .../ports/unix/include/{ => arch}/perf.h | 0 .../ports/unix/include/{ => arch}/sys_arch.h | 0 src/VirtualSocket.cpp | 2 +- src/VirtualTap.hpp | 14 ++++++++++++++ 6 files changed, 22 insertions(+), 5 deletions(-) rename ext/lwip-contrib/ports/unix/include/{ => arch}/cc.h (100%) rename ext/lwip-contrib/ports/unix/include/{ => arch}/perf.h (100%) rename ext/lwip-contrib/ports/unix/include/{ => arch}/sys_arch.h (100%) diff --git a/Makefile b/Makefile index 75f074d..5b97fdd 100644 --- a/Makefile +++ b/Makefile @@ -35,6 +35,7 @@ endif OSTYPE=$(shell uname -s | tr '[A-Z]' '[a-z]') BUILD=build/$(OSTYPE) +LWIPCONTRIBDIR=ext/lwip-contrib # Windows ifeq ($(OSTYPE),mingw32_nt-6.2) @@ -44,29 +45,31 @@ CC=gcc CXX=g++ CXXFLAGS+=-Wno-unknown-pragmas -Wno-pointer-arith -Wno-deprecated-declarations -Wno-conversion-null WINDEFS=-lws2_32 -lshlwapi -liphlpapi -static -static-libgcc -static-libstdc++ -CONTRIBDIR=ext/lwip-contrib -LWIPARCH=$(CONTRIBDIR)/ports/win32 -LWIPARCHINCLUDE=$(LWIPARCH)/include +LWIPARCHINCLUDE=$(LWIPCONTRIBDIR)/ports/win32/include endif # Darwin ifeq ($(OSTYPE),darwin) ARTOOL=libtool ARFLAGS=-static +LWIPARCHINCLUDE=$(LWIPCONTRIBDIR)/ports/unix/include endif # Linux ifeq 
($(OSTYPE),linux) ARTOOL=ar ARFLAGS=rcs +LWIPARCHINCLUDE=$(LWIPCONTRIBDIR)/ports/unix/include endif # FreeBSD ifeq ($(OSTYPE),freebsd) ARTOOL=ar ARFLAGS=rcs +LWIPARCHINCLUDE=$(LWIPCONTRIBDIR)/ports/unix/include endif # OpenBSD ifeq ($(OSTYPE),openbsd) ARTOOL=ar ARFLAGS=rcs +LWIPARCHINCLUDE=$(LWIPCONTRIBDIR)/ports/unix/include endif ############################################################################## @@ -224,7 +227,7 @@ STACK_DRIVER_DEFS+=-DLWIP_DONT_PROVIDE_BYTEORDER_FUNCTIONS STACK_DRIVER_DEFS+=-DSTACK_LWIP STACK_DRIVER_FILES:=src/lwIP.cpp LWIPDIR=ext/lwip/src -STACK_INCLUDES+=$(LWIPARCHINCLUDE) -Iext/lwip/src/include/lwip \ +STACK_INCLUDES+=-I$(LWIPARCHINCLUDE) -Iext/lwip/src/include/lwip \ -I$(LWIPDIR)/include \ -I$(LWIPARCH)/include \ -I$(LWIPDIR)/include/ipv4 \ diff --git a/ext/lwip-contrib/ports/unix/include/cc.h b/ext/lwip-contrib/ports/unix/include/arch/cc.h similarity index 100% rename from ext/lwip-contrib/ports/unix/include/cc.h rename to ext/lwip-contrib/ports/unix/include/arch/cc.h diff --git a/ext/lwip-contrib/ports/unix/include/perf.h b/ext/lwip-contrib/ports/unix/include/arch/perf.h similarity index 100% rename from ext/lwip-contrib/ports/unix/include/perf.h rename to ext/lwip-contrib/ports/unix/include/arch/perf.h diff --git a/ext/lwip-contrib/ports/unix/include/sys_arch.h b/ext/lwip-contrib/ports/unix/include/arch/sys_arch.h similarity index 100% rename from ext/lwip-contrib/ports/unix/include/sys_arch.h rename to ext/lwip-contrib/ports/unix/include/arch/sys_arch.h diff --git a/src/VirtualSocket.cpp b/src/VirtualSocket.cpp index 6cc872a..f19edc4 100644 --- a/src/VirtualSocket.cpp +++ b/src/VirtualSocket.cpp @@ -35,7 +35,7 @@ #include -#if defined(__linux__) || #defined(__APPLE__) +#if defined(__linux__) || defined(__APPLE__) #include #endif diff --git a/src/VirtualTap.hpp b/src/VirtualTap.hpp index 9fc3457..b1e5c30 100644 --- a/src/VirtualTap.hpp +++ b/src/VirtualTap.hpp @@ -119,6 +119,20 @@ namespace ZeroTier { void threadMain() 
throw(); +#if defined(__MINGW32__) + /* The following is merely to make ZeroTier's OneService happy while building on Windows. + we won't use these in libzt */ + NET_LUID _deviceLuid; + std::string _deviceInstanceId; + + /** + * Returns whether the VirtualTap interface has been initialized + */ + bool isInitialized() const { return _initialized; }; + + inline const NET_LUID &luid() const { return _deviceLuid; } + inline const std::string &instanceId() const { return _deviceInstanceId; } +#endif /** * For moving data onto the ZeroTier virtual wire */ From 35aa1820eff0758a5a2b86f45b0959512b4af665 Mon Sep 17 00:00:00 2001 From: Joseph Henry Date: Tue, 10 Oct 2017 12:15:10 -0700 Subject: [PATCH 03/11] Fixed signed comparison warnings when compiling under Windows --- src/ZT1Service.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/ZT1Service.cpp b/src/ZT1Service.cpp index 18dc10f..bff5ad5 100644 --- a/src/ZT1Service.cpp +++ b/src/ZT1Service.cpp @@ -69,7 +69,7 @@ ZeroTier::VirtualTap *getTapByNWID(uint64_t nwid) { ZeroTier::_vtaps_lock.lock(); ZeroTier::VirtualTap *s, *tap = nullptr; - for (int i=0; i_nwid == nwid) { tap = s; } } @@ -82,7 +82,7 @@ ZeroTier::VirtualTap *getTapByAddr(ZeroTier::InetAddress *addr) ZeroTier::_vtaps_lock.lock(); ZeroTier::VirtualTap *s, *tap = nullptr; //char ipbuf[64], ipbuf2[64], ipbuf3[64]; - for (int i=0; i_ips.size(); j++) { @@ -104,7 +104,7 @@ ZeroTier::VirtualTap *getTapByAddr(ZeroTier::InetAddress *addr) if (tap == NULL) { std::vector *managed_routes = ZeroTier::zt1Service->getRoutes(s->_nwid); ZeroTier::InetAddress target, nm, via; - for (int i=0; isize(); i++) { + for (size_t i=0; isize(); i++) { target = managed_routes->at(i).target; nm = target.netmask(); via = managed_routes->at(i).via; @@ -124,7 +124,7 @@ ZeroTier::VirtualTap *getTapByName(char *ifname) { ZeroTier::_vtaps_lock.lock(); ZeroTier::VirtualTap *s, *tap = nullptr; - for (int i=0; i_dev.c_str(), ifname) == false) { tap = 
s; @@ -134,11 +134,11 @@ ZeroTier::VirtualTap *getTapByName(char *ifname) return tap; } -ZeroTier::VirtualTap *getTapByIndex(int index) +ZeroTier::VirtualTap *getTapByIndex(size_t index) { ZeroTier::_vtaps_lock.lock(); ZeroTier::VirtualTap *s, *tap = nullptr; - for (int i=0; iifindex == index) { tap = s; @@ -247,7 +247,7 @@ void *zts_start_service(void *thread_id) void disableTaps() { ZeroTier::_vtaps_lock.lock(); - for (int i=0; i_enabled = false; } @@ -260,7 +260,7 @@ void zts_get_ipv4_address(const char *nwid, char *addrstr, const int addrlen) uint64_t nwid_int = strtoull(nwid, NULL, 16); ZeroTier::VirtualTap *tap = getTapByNWID(nwid_int); if (tap && tap->_ips.size()) { - for (int i=0; i_ips.size(); i++) { + for (size_t i=0; i_ips.size(); i++) { if (tap->_ips[i].isV4()) { char ipbuf[INET_ADDRSTRLEN]; std::string addr = tap->_ips[i].toString(ipbuf); @@ -282,7 +282,7 @@ void zts_get_ipv6_address(const char *nwid, char *addrstr, const int addrlen) uint64_t nwid_int = strtoull(nwid, NULL, 16); ZeroTier::VirtualTap *tap = getTapByNWID(nwid_int); if (tap && tap->_ips.size()) { - for (int i=0; i_ips.size(); i++) { + for (size_t i=0; i_ips.size(); i++) { if (tap->_ips[i].isV6()) { char ipbuf[INET6_ADDRSTRLEN]; std::string addr = tap->_ips[i].toString(ipbuf); @@ -351,7 +351,7 @@ void zts_join(const char * nwid) { } // provide ZTO service reference to virtual taps // TODO: This might prove to be unreliable, but it works for now - for (int i=0;izt1ServiceRef=(void*)ZeroTier::zt1Service; } @@ -470,7 +470,7 @@ int zts_get_peer_address(char *peer, const char *devID) { if (ZeroTier::zt1Service) { ZT_PeerList *pl = ZeroTier::zt1Service->getNode()->peers(); // uint64_t addr; - for (int i=0; ipeerCount; i++) { + for (size_t i=0; ipeerCount; i++) { // ZT_Peer *p = &(pl->peers[i]); // DEBUG_INFO("peer[%d] = %lx", i, p->address); } From f9754d82162d0e1a25fcab77dcf5ce3809b97cb4 Mon Sep 17 00:00:00 2001 From: Joseph Henry Date: Tue, 10 Oct 2017 12:22:57 -0700 Subject: [PATCH 04/11] 
zts_start initialization order fix for ztproxy. Was creating Phy instance before calling WSAStartup on Windows --- examples/ztproxy/ztproxy.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/ztproxy/ztproxy.cpp b/examples/ztproxy/ztproxy.cpp index d5a913c..07f5fbb 100644 --- a/examples/ztproxy/ztproxy.cpp +++ b/examples/ztproxy/ztproxy.cpp @@ -63,10 +63,6 @@ namespace ZeroTier { _internal_addr(internal_addr), _phy(this,false,true) { - // Start ZeroTier Node - // Join Network which contains resources we need to proxy - DEBUG_INFO("waiting for libzt to come online"); - zts_simple_start(path.c_str(), nwid.c_str()); // Set up TCP listen sockets // IPv4 struct sockaddr_in in4; @@ -380,6 +376,11 @@ int main(int argc, char **argv) int internal_port = atoi(argv[5]); std::string dns_nameserver= "";//argv[6]; + // Start ZeroTier Node + // Join Network which contains resources we need to proxy + DEBUG_INFO("waiting for libzt to come online"); + zts_simple_start(path.c_str(), nwid.c_str()); + ZeroTier::ZTProxy *proxy = new ZeroTier::ZTProxy(proxy_listen_port, nwid, path, internal_addr, internal_port, dns_nameserver); if (proxy) { From 27dddb2f459ab5ff37c1e8eb4644e8ceda8ecdcc Mon Sep 17 00:00:00 2001 From: Joseph Henry Date: Tue, 10 Oct 2017 12:23:42 -0700 Subject: [PATCH 05/11] Added WSAStartup() and WSACleanup() calls to zts_start() and zts_stop() --- src/ZT1Service.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/ZT1Service.cpp b/src/ZT1Service.cpp index bff5ad5..d995a7a 100644 --- a/src/ZT1Service.cpp +++ b/src/ZT1Service.cpp @@ -55,6 +55,10 @@ namespace ZeroTier { ZeroTier::Mutex _multiplexer_lock; } +#if defined(__MINGW32__) || defined(__MINGW64__) +WSADATA wsaData; +#endif + /****************************************************************************/ /* ZeroTier Core helper functions for libzt - DON'T CALL THESE DIRECTLY */ /****************************************************************************/ @@ -395,6 
+399,9 @@ void zts_start(const char *path) if (path) { ZeroTier::homeDir = path; } +#if defined(__MINGW32__) || defined(__MINGW64__) + WSAStartup(MAKEWORD(2, 2), &wsaData) // initialize WinSock. Used in Phy for loopback pipe +#endif pthread_t service_thread; pthread_create(&service_thread, NULL, zts_start_service, NULL); } @@ -426,6 +433,9 @@ void zts_stop() { ZeroTier::zt1Service->terminate(); disableTaps(); } +#if defined(__MINGW32__) || defined(__MINGW64__) + WSACleanup(); // clean up WinSock +#endif } void zts_get_homepath(char *homePath, int len) { From 2f59773b2691391da9236e6a9b5d32ad96887870 Mon Sep 17 00:00:00 2001 From: Joseph Henry Date: Tue, 10 Oct 2017 12:30:44 -0700 Subject: [PATCH 06/11] Updated ZTO submodule to f2c69ede for CancelSynchronousIo omission in Thread.hpp --- zto | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zto b/zto index 59b7cbb..f2c69ed 160000 --- a/zto +++ b/zto @@ -1 +1 @@ -Subproject commit 59b7cbb591b8f9ed4abfc25773619d6b1bebc4d2 +Subproject commit f2c69ede9604f52da4aa885ae8d4d16a7f3fdabf From 427f87db913d7b8b6a3817f3293126892fdf8288 Mon Sep 17 00:00:00 2001 From: Joseph Henry Date: Tue, 10 Oct 2017 12:41:36 -0700 Subject: [PATCH 07/11] More fixes for Windows support --- src/VirtualTap.cpp | 1 + src/VirtualTap.hpp | 1 + 2 files changed, 2 insertions(+) diff --git a/src/VirtualTap.cpp b/src/VirtualTap.cpp index c33a0c0..1ac169b 100644 --- a/src/VirtualTap.cpp +++ b/src/VirtualTap.cpp @@ -84,6 +84,7 @@ namespace ZeroTier { _handler(handler), _homePath(homePath), _arg(arg), + _initialized(false), _enabled(true), _run(true), _mac(mac), diff --git a/src/VirtualTap.hpp b/src/VirtualTap.hpp index b1e5c30..58fb788 100644 --- a/src/VirtualTap.hpp +++ b/src/VirtualTap.hpp @@ -206,6 +206,7 @@ namespace ZeroTier { std::string _homePath; void *_arg; + volatile bool _initialized; volatile bool _enabled; volatile bool _run; MAC _mac; From 99dcce6d8b4cc30f5079b300282576818a171d27 Mon Sep 17 00:00:00 2001 From: Joseph Henry
Date: Tue, 10 Oct 2017 12:48:37 -0700 Subject: [PATCH 08/11] Fixed signed comparison warnings in ztproxy when compiling under Windows --- examples/ztproxy/ztproxy.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/ztproxy/ztproxy.cpp b/examples/ztproxy/ztproxy.cpp index 07f5fbb..f7ca154 100644 --- a/examples/ztproxy/ztproxy.cpp +++ b/examples/ztproxy/ztproxy.cpp @@ -129,7 +129,7 @@ namespace ZeroTier { FD_ZERO(&read_set); FD_ZERO(&write_set); nfds = 0; - for (int i=0; izfd, &read_set); FD_SET(clist[i]->zfd, &write_set); nfds = clist[i]->zfd > nfds ? clist[i]->zfd : nfds; @@ -324,7 +324,7 @@ namespace ZeroTier { zts_close(conn->zfd); } cmap.erase(sock); - for (int i=0; i Date: Tue, 10 Oct 2017 13:02:21 -0700 Subject: [PATCH 09/11] Fixed typo --- src/ZT1Service.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ZT1Service.cpp b/src/ZT1Service.cpp index d995a7a..085ba4a 100644 --- a/src/ZT1Service.cpp +++ b/src/ZT1Service.cpp @@ -400,7 +400,7 @@ void zts_start(const char *path) ZeroTier::homeDir = path; } #if defined(__MINGW32__) || defined(__MINGW64__) - WSAStartup(MAKEWORD(2, 2), &wsaData) // initialize WinSock. Used in Phy for loopback pipe + WSAStartup(MAKEWORD(2, 2), &wsaData); // initialize WinSock. 
Used in Phy for loopback pipe #endif pthread_t service_thread; pthread_create(&service_thread, NULL, zts_start_service, NULL); From 9b9aa108c22242689013c8d4f54cfd2fdee5541a Mon Sep 17 00:00:00 2001 From: Joseph Henry Date: Tue, 10 Oct 2017 13:05:08 -0700 Subject: [PATCH 10/11] Added includes for various interface-related Windows defines --- src/VirtualTap.hpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/VirtualTap.hpp b/src/VirtualTap.hpp index 58fb788..6ad6a7c 100644 --- a/src/VirtualTap.hpp +++ b/src/VirtualTap.hpp @@ -41,6 +41,13 @@ #include "Thread.hpp" #include "Phy.hpp" +#if defined(__MINGW32__) || defined(__MINGW64__) +#include +#include +#include +#include +#endif + namespace ZeroTier { /** From 08b7ccb921bcfff1f52a5e9895f417979ad085a7 Mon Sep 17 00:00:00 2001 From: Joseph Henry Date: Tue, 10 Oct 2017 14:20:20 -0700 Subject: [PATCH 11/11] Added return values to zts_start() and zts_simple_start(). Also Fixed signed comparison warnings in ztproxy when compiling under Windows --- ext/picotcp/RFC/rfc0793.txt | 5247 +++++++++++++++++ ext/picotcp/RFC/rfc1066.txt | 5043 ++++++++++++++++ ext/picotcp/RFC/rfc1122.txt | 6844 +++++++++++++++++++++ ext/picotcp/RFC/rfc1123.txt | 5782 ++++++++++++++++++ ext/picotcp/RFC/rfc1323.txt | 2075 +++++++ ext/picotcp/RFC/rfc1379.txt | 2131 +++++++ ext/picotcp/RFC/rfc1470.txt | 10755 ++++++++++++++++++++++++++++++++++ ext/picotcp/RFC/rfc1644.txt | 2131 +++++++ ext/picotcp/RFC/rfc1661.txt | 2976 ++++++++++ ext/picotcp/RFC/rfc1693.txt | 2019 +++++++ ext/picotcp/RFC/rfc2026.txt | 2019 +++++++ ext/picotcp/RFC/rfc2131.txt | 2523 ++++++++ ext/picotcp/RFC/rfc2460.txt | 2187 +++++++ ext/picotcp/RFC/rfc2525.txt | 3419 +++++++++++ ext/picotcp/RFC/rfc2757.txt | 2579 ++++++++ ext/picotcp/RFC/rfc2760.txt | 2579 ++++++++ ext/picotcp/RFC/rfc3135.txt | 2523 ++++++++ ext/picotcp/RFC/rfc3168.txt | 3531 +++++++++++ ext/picotcp/RFC/rfc3449.txt | 2299 ++++++++ ext/picotcp/RFC/rfc3493.txt | 2187 +++++++ ext/picotcp/RFC/rfc3649.txt | 1907 
++++++ ext/picotcp/RFC/rfc3819.txt | 3363 +++++++++++ ext/picotcp/RFC/rfc3927.txt | 1851 ++++++ ext/picotcp/RFC/rfc4614.txt | 1851 ++++++ ext/picotcp/RFC/rfc6762.txt | 3923 +++++++++++++ include/ZT1Service.h | 16 +- include/libzt.h | 11 +- src/VirtualTap.hpp | 2 +- src/ZT1Service.cpp | 19 +- 29 files changed, 83768 insertions(+), 24 deletions(-) create mode 100644 ext/picotcp/RFC/rfc0793.txt create mode 100644 ext/picotcp/RFC/rfc1066.txt create mode 100644 ext/picotcp/RFC/rfc1122.txt create mode 100644 ext/picotcp/RFC/rfc1123.txt create mode 100644 ext/picotcp/RFC/rfc1323.txt create mode 100644 ext/picotcp/RFC/rfc1379.txt create mode 100644 ext/picotcp/RFC/rfc1470.txt create mode 100644 ext/picotcp/RFC/rfc1644.txt create mode 100644 ext/picotcp/RFC/rfc1661.txt create mode 100644 ext/picotcp/RFC/rfc1693.txt create mode 100644 ext/picotcp/RFC/rfc2026.txt create mode 100644 ext/picotcp/RFC/rfc2131.txt create mode 100644 ext/picotcp/RFC/rfc2460.txt create mode 100644 ext/picotcp/RFC/rfc2525.txt create mode 100644 ext/picotcp/RFC/rfc2757.txt create mode 100644 ext/picotcp/RFC/rfc2760.txt create mode 100644 ext/picotcp/RFC/rfc3135.txt create mode 100644 ext/picotcp/RFC/rfc3168.txt create mode 100644 ext/picotcp/RFC/rfc3449.txt create mode 100644 ext/picotcp/RFC/rfc3493.txt create mode 100644 ext/picotcp/RFC/rfc3649.txt create mode 100644 ext/picotcp/RFC/rfc3819.txt create mode 100644 ext/picotcp/RFC/rfc3927.txt create mode 100644 ext/picotcp/RFC/rfc4614.txt create mode 100644 ext/picotcp/RFC/rfc6762.txt diff --git a/ext/picotcp/RFC/rfc0793.txt b/ext/picotcp/RFC/rfc0793.txt new file mode 100644 index 0000000..603a78c --- /dev/null +++ b/ext/picotcp/RFC/rfc0793.txt @@ -0,0 +1,5247 @@ + + +RFC: 793 + + + + + + + + TRANSMISSION CONTROL PROTOCOL + + + DARPA INTERNET PROGRAM + + PROTOCOL SPECIFICATION + + + + September 1981 + + + + + + + + + + + + + + prepared for + + Defense Advanced Research Projects Agency + Information Processing Techniques Office + 1400 Wilson Boulevard + 
Arlington, Virginia 22209 + + + + + + + + by + + Information Sciences Institute + University of Southern California + 4676 Admiralty Way + Marina del Rey, California 90291 + + + +September 1981 + Transmission Control Protocol + + + + TABLE OF CONTENTS + + PREFACE ........................................................ iii + +1. INTRODUCTION ..................................................... 1 + + 1.1 Motivation .................................................... 1 + 1.2 Scope ......................................................... 2 + 1.3 About This Document ........................................... 2 + 1.4 Interfaces .................................................... 3 + 1.5 Operation ..................................................... 3 + +2. PHILOSOPHY ....................................................... 7 + + 2.1 Elements of the Internetwork System ........................... 7 + 2.2 Model of Operation ............................................ 7 + 2.3 The Host Environment .......................................... 8 + 2.4 Interfaces .................................................... 9 + 2.5 Relation to Other Protocols ................................... 9 + 2.6 Reliable Communication ........................................ 9 + 2.7 Connection Establishment and Clearing ........................ 10 + 2.8 Data Communication ........................................... 12 + 2.9 Precedence and Security ...................................... 13 + 2.10 Robustness Principle ......................................... 13 + +3. FUNCTIONAL SPECIFICATION ........................................ 15 + + 3.1 Header Format ................................................ 15 + 3.2 Terminology .................................................. 19 + 3.3 Sequence Numbers ............................................. 24 + 3.4 Establishing a connection .................................... 30 + 3.5 Closing a Connection ......................................... 
37 + 3.6 Precedence and Security ...................................... 40 + 3.7 Data Communication ........................................... 40 + 3.8 Interfaces ................................................... 44 + 3.9 Event Processing ............................................. 52 + +GLOSSARY ............................................................ 79 + +REFERENCES .......................................................... 85 + + + + + + + + + + + + [Page i] + + + September 1981 +Transmission Control Protocol + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +[Page ii] + + +September 1981 + Transmission Control Protocol + + + + PREFACE + + + +This document describes the DoD Standard Transmission Control Protocol +(TCP). There have been nine earlier editions of the ARPA TCP +specification on which this standard is based, and the present text +draws heavily from them. There have been many contributors to this work +both in terms of concepts and in terms of text. This edition clarifies +several details and removes the end-of-letter buffer-size adjustments, +and redescribes the letter mechanism as a push function. + + Jon Postel + + Editor + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + [Page iii] + + + + +RFC: 793 +Replaces: RFC 761 +IENs: 129, 124, 112, 81, +55, 44, 40, 27, 21, 5 + + TRANSMISSION CONTROL PROTOCOL + + DARPA INTERNET PROGRAM + PROTOCOL SPECIFICATION + + + + 1. INTRODUCTION + +The Transmission Control Protocol (TCP) is intended for use as a highly +reliable host-to-host protocol between hosts in packet-switched computer +communication networks, and in interconnected systems of such networks. + +This document describes the functions to be performed by the +Transmission Control Protocol, the program that implements it, and its +interface to programs or users that require its services. + +1.1. 
Motivation + + Computer communication systems are playing an increasingly important + role in military, government, and civilian environments. This + document focuses its attention primarily on military computer + communication requirements, especially robustness in the presence of + communication unreliability and availability in the presence of + congestion, but many of these problems are found in the civilian and + government sector as well. + + As strategic and tactical computer communication networks are + developed and deployed, it is essential to provide means of + interconnecting them and to provide standard interprocess + communication protocols which can support a broad range of + applications. In anticipation of the need for such standards, the + Deputy Undersecretary of Defense for Research and Engineering has + declared the Transmission Control Protocol (TCP) described herein to + be a basis for DoD-wide inter-process communication protocol + standardization. + + TCP is a connection-oriented, end-to-end reliable protocol designed to + fit into a layered hierarchy of protocols which support multi-network + applications. The TCP provides for reliable inter-process + communication between pairs of processes in host computers attached to + distinct but interconnected computer communication networks. Very few + assumptions are made as to the reliability of the communication + protocols below the TCP layer. TCP assumes it can obtain a simple, + potentially unreliable datagram service from the lower level + protocols. In principle, the TCP should be able to operate above a + wide spectrum of communication systems ranging from hard-wired + connections to packet-switched or circuit-switched networks. + + + [Page 1] + + + September 1981 +Transmission Control Protocol +Introduction + + + + TCP is based on concepts first described by Cerf and Kahn in [1]. 
The + TCP fits into a layered protocol architecture just above a basic + Internet Protocol [2] which provides a way for the TCP to send and + receive variable-length segments of information enclosed in internet + datagram "envelopes". The internet datagram provides a means for + addressing source and destination TCPs in different networks. The + internet protocol also deals with any fragmentation or reassembly of + the TCP segments required to achieve transport and delivery through + multiple networks and interconnecting gateways. The internet protocol + also carries information on the precedence, security classification + and compartmentation of the TCP segments, so this information can be + communicated end-to-end across multiple networks. + + Protocol Layering + + +---------------------+ + | higher-level | + +---------------------+ + | TCP | + +---------------------+ + | internet protocol | + +---------------------+ + |communication network| + +---------------------+ + + Figure 1 + + Much of this document is written in the context of TCP implementations + which are co-resident with higher level protocols in the host + computer. Some computer systems will be connected to networks via + front-end computers which house the TCP and internet protocol layers, + as well as network specific software. The TCP specification describes + an interface to the higher level protocols which appears to be + implementable even for the front-end case, as long as a suitable + host-to-front end protocol is implemented. + +1.2. Scope + + The TCP is intended to provide a reliable process-to-process + communication service in a multinetwork environment. The TCP is + intended to be a host-to-host protocol in common use in multiple + networks. + +1.3. About this Document + + This document represents a specification of the behavior required of + any TCP implementation, both in its interactions with higher level + protocols and in its interactions with other TCPs. 
The rest of this + + +[Page 2] + + +September 1981 + Transmission Control Protocol + Introduction + + + + section offers a very brief view of the protocol interfaces and + operation. Section 2 summarizes the philosophical basis for the TCP + design. Section 3 offers both a detailed description of the actions + required of TCP when various events occur (arrival of new segments, + user calls, errors, etc.) and the details of the formats of TCP + segments. + +1.4. Interfaces + + The TCP interfaces on one side to user or application processes and on + the other side to a lower level protocol such as Internet Protocol. + + The interface between an application process and the TCP is + illustrated in reasonable detail. This interface consists of a set of + calls much like the calls an operating system provides to an + application process for manipulating files. For example, there are + calls to open and close connections and to send and receive data on + established connections. It is also expected that the TCP can + asynchronously communicate with application programs. Although + considerable freedom is permitted to TCP implementors to design + interfaces which are appropriate to a particular operating system + environment, a minimum functionality is required at the TCP/user + interface for any valid implementation. + + The interface between TCP and lower level protocol is essentially + unspecified except that it is assumed there is a mechanism whereby the + two levels can asynchronously pass information to each other. + Typically, one expects the lower level protocol to specify this + interface. TCP is designed to work in a very general environment of + interconnected networks. The lower level protocol which is assumed + throughout this document is the Internet Protocol [2]. + +1.5. Operation + + As noted above, the primary purpose of the TCP is to provide reliable, + securable logical circuit or connection service between pairs of + processes. 
To provide this service on top of a less reliable internet + communication system requires facilities in the following areas: + + Basic Data Transfer + Reliability + Flow Control + Multiplexing + Connections + Precedence and Security + + The basic operation of the TCP in each of these areas is described in + the following paragraphs. + + + [Page 3] + + + September 1981 +Transmission Control Protocol +Introduction + + + + Basic Data Transfer: + + The TCP is able to transfer a continuous stream of octets in each + direction between its users by packaging some number of octets into + segments for transmission through the internet system. In general, + the TCPs decide when to block and forward data at their own + convenience. + + Sometimes users need to be sure that all the data they have + submitted to the TCP has been transmitted. For this purpose a push + function is defined. To assure that data submitted to a TCP is + actually transmitted the sending user indicates that it should be + pushed through to the receiving user. A push causes the TCPs to + promptly forward and deliver data up to that point to the receiver. + The exact push point might not be visible to the receiving user and + the push function does not supply a record boundary marker. + + Reliability: + + The TCP must recover from data that is damaged, lost, duplicated, or + delivered out of order by the internet communication system. This + is achieved by assigning a sequence number to each octet + transmitted, and requiring a positive acknowledgment (ACK) from the + receiving TCP. If the ACK is not received within a timeout + interval, the data is retransmitted. At the receiver, the sequence + numbers are used to correctly order segments that may be received + out of order and to eliminate duplicates. Damage is handled by + adding a checksum to each segment transmitted, checking it at the + receiver, and discarding damaged segments. 
+ + As long as the TCPs continue to function properly and the internet + system does not become completely partitioned, no transmission + errors will affect the correct delivery of data. TCP recovers from + internet communication system errors. + + Flow Control: + + TCP provides a means for the receiver to govern the amount of data + sent by the sender. This is achieved by returning a "window" with + every ACK indicating a range of acceptable sequence numbers beyond + the last segment successfully received. The window indicates an + allowed number of octets that the sender may transmit before + receiving further permission. + + + + + + + +[Page 4] + + +September 1981 + Transmission Control Protocol + Introduction + + + + Multiplexing: + + To allow for many processes within a single Host to use TCP + communication facilities simultaneously, the TCP provides a set of + addresses or ports within each host. Concatenated with the network + and host addresses from the internet communication layer, this forms + a socket. A pair of sockets uniquely identifies each connection. + That is, a socket may be simultaneously used in multiple + connections. + + The binding of ports to processes is handled independently by each + Host. However, it proves useful to attach frequently used processes + (e.g., a "logger" or timesharing service) to fixed sockets which are + made known to the public. These services can then be accessed + through the known addresses. Establishing and learning the port + addresses of other processes may involve more dynamic mechanisms. + + Connections: + + The reliability and flow control mechanisms described above require + that TCPs initialize and maintain certain status information for + each data stream. The combination of this information, including + sockets, sequence numbers, and window sizes, is called a connection. + Each connection is uniquely specified by a pair of sockets + identifying its two sides. 
+ + When two processes wish to communicate, their TCPs must first + establish a connection (initialize the status information on each + side). When their communication is complete, the connection is + terminated or closed to free the resources for other uses. + + Since connections must be established between unreliable hosts and + over the unreliable internet communication system, a handshake + mechanism with clock-based sequence numbers is used to avoid + erroneous initialization of connections. + + Precedence and Security: + + The users of TCP may indicate the security and precedence of their + communication. Provision is made for default values to be used when + these features are not needed. + + + + + + + + + + [Page 5] + + + September 1981 +Transmission Control Protocol + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +[Page 6] + + +September 1981 + Transmission Control Protocol + + + + 2. PHILOSOPHY + +2.1. Elements of the Internetwork System + + The internetwork environment consists of hosts connected to networks + which are in turn interconnected via gateways. It is assumed here + that the networks may be either local networks (e.g., the ETHERNET) or + large networks (e.g., the ARPANET), but in any case are based on + packet switching technology. The active agents that produce and + consume messages are processes. Various levels of protocols in the + networks, the gateways, and the hosts support an interprocess + communication system that provides two-way data flow on logical + connections between process ports. + + The term packet is used generically here to mean the data of one + transaction between a host and its network. The format of data blocks + exchanged within a network will generally not be of concern to us. + + Hosts are computers attached to a network, and from the communication + network's point of view, are the sources and destinations of packets.
+ Processes are viewed as the active elements in host computers (in + accordance with the fairly common definition of a process as a program + in execution). Even terminals and files or other I/O devices are + viewed as communicating with each other through the use of processes. + Thus, all communication is viewed as inter-process communication. + + Since a process may need to distinguish among several communication + streams between itself and another process (or processes), we imagine + that each process may have a number of ports through which it + communicates with the ports of other processes. + +2.2. Model of Operation + + Processes transmit data by calling on the TCP and passing buffers of + data as arguments. The TCP packages the data from these buffers into + segments and calls on the internet module to transmit each segment to + the destination TCP. The receiving TCP places the data from a segment + into the receiving user's buffer and notifies the receiving user. The + TCPs include control information in the segments which they use to + ensure reliable ordered data transmission. + + The model of internet communication is that there is an internet + protocol module associated with each TCP which provides an interface + to the local network. This internet module packages TCP segments + inside internet datagrams and routes these datagrams to a destination + internet module or intermediate gateway. To transmit the datagram + through the local network, it is embedded in a local network packet. + + The packet switches may perform further packaging, fragmentation, or + + + [Page 7] + + + September 1981 +Transmission Control Protocol +Philosophy + + + + other operations to achieve the delivery of the local packet to the + destination internet module. + + At a gateway between networks, the internet datagram is "unwrapped" + from its local packet and examined to determine through which network + the internet datagram should travel next. 
The internet datagram is + then "wrapped" in a local packet suitable to the next network and + routed to the next gateway, or to the final destination. + + A gateway is permitted to break up an internet datagram into smaller + internet datagram fragments if this is necessary for transmission + through the next network. To do this, the gateway produces a set of + internet datagrams; each carrying a fragment. Fragments may be + further broken into smaller fragments at subsequent gateways. The + internet datagram fragment format is designed so that the destination + internet module can reassemble fragments into internet datagrams. + + A destination internet module unwraps the segment from the datagram + (after reassembling the datagram, if necessary) and passes it to the + destination TCP. + + This simple model of the operation glosses over many details. One + important feature is the type of service. This provides information + to the gateway (or internet module) to guide it in selecting the + service parameters to be used in traversing the next network. + Included in the type of service information is the precedence of the + datagram. Datagrams may also carry security information to permit + host and gateways that operate in multilevel secure environments to + properly segregate datagrams for security considerations. + +2.3. The Host Environment + + The TCP is assumed to be a module in an operating system. The users + access the TCP much like they would access the file system. The TCP + may call on other operating system functions, for example, to manage + data structures. The actual interface to the network is assumed to be + controlled by a device driver module. The TCP does not call on the + network device driver directly, but rather calls on the internet + datagram protocol module which may in turn call on the device driver. + + The mechanisms of TCP do not preclude implementation of the TCP in a + front-end processor. 
However, in such an implementation, a + host-to-front-end protocol must provide the functionality to support + the type of TCP-user interface described in this document. + + + + + + +[Page 8] + + +September 1981 + Transmission Control Protocol + Philosophy + + + +2.4. Interfaces + + The TCP/user interface provides for calls made by the user on the TCP + to OPEN or CLOSE a connection, to SEND or RECEIVE data, or to obtain + STATUS about a connection. These calls are like other calls from user + programs on the operating system, for example, the calls to open, read + from, and close a file. + + The TCP/internet interface provides calls to send and receive + datagrams addressed to TCP modules in hosts anywhere in the internet + system. These calls have parameters for passing the address, type of + service, precedence, security, and other control information. + +2.5. Relation to Other Protocols + + The following diagram illustrates the place of the TCP in the protocol + hierarchy: + + + +------+ +-----+ +-----+ +-----+ + |Telnet| | FTP | |Voice| ... | | Application Level + +------+ +-----+ +-----+ +-----+ + | | | | + +-----+ +-----+ +-----+ + | TCP | | RTP | ... | | Host Level + +-----+ +-----+ +-----+ + | | | + +-------------------------------+ + | Internet Protocol & ICMP | Gateway Level + +-------------------------------+ + | + +---------------------------+ + | Local Network Protocol | Network Level + +---------------------------+ + + Protocol Relationships + + Figure 2. + + It is expected that the TCP will be able to support higher level + protocols efficiently. It should be easy to interface higher level + protocols like the ARPANET Telnet or AUTODIN II THP to the TCP. + +2.6. Reliable Communication + + A stream of data sent on a TCP connection is delivered reliably and in + order at the destination. 
+ + + + [Page 9] + + + September 1981 +Transmission Control Protocol +Philosophy + + + + Transmission is made reliable via the use of sequence numbers and + acknowledgments. Conceptually, each octet of data is assigned a + sequence number. The sequence number of the first octet of data in a + segment is transmitted with that segment and is called the segment + sequence number. Segments also carry an acknowledgment number which + is the sequence number of the next expected data octet of + transmissions in the reverse direction. When the TCP transmits a + segment containing data, it puts a copy on a retransmission queue and + starts a timer; when the acknowledgment for that data is received, the + segment is deleted from the queue. If the acknowledgment is not + received before the timer runs out, the segment is retransmitted. + + An acknowledgment by TCP does not guarantee that the data has been + delivered to the end user, but only that the receiving TCP has taken + the responsibility to do so. + + To govern the flow of data between TCPs, a flow control mechanism is + employed. The receiving TCP reports a "window" to the sending TCP. + This window specifies the number of octets, starting with the + acknowledgment number, that the receiving TCP is currently prepared to + receive. + +2.7. Connection Establishment and Clearing + + To identify the separate data streams that a TCP may handle, the TCP + provides a port identifier. Since port identifiers are selected + independently by each TCP they might not be unique. To provide for + unique addresses within each TCP, we concatenate an internet address + identifying the TCP with a port identifier to create a socket which + will be unique throughout all networks connected together. + + A connection is fully specified by the pair of sockets at the ends. A + local socket may participate in many connections to different foreign + sockets. 
A connection can be used to carry data in both directions, + that is, it is "full duplex". + + TCPs are free to associate ports with processes however they choose. + However, several basic concepts are necessary in any implementation. + There must be well-known sockets which the TCP associates only with + the "appropriate" processes by some means. We envision that processes + may "own" ports, and that processes can initiate connections only on + the ports they own. (Means for implementing ownership is a local + issue, but we envision a Request Port user command, or a method of + uniquely allocating a group of ports to a given process, e.g., by + associating the high order bits of a port name with a given process.) + + A connection is specified in the OPEN call by the local port and + foreign socket arguments. In return, the TCP supplies a (short) local + + +[Page 10] + + +September 1981 + Transmission Control Protocol + Philosophy + + + + connection name by which the user refers to the connection in + subsequent calls. There are several things that must be remembered + about a connection. To store this information we imagine that there + is a data structure called a Transmission Control Block (TCB). One + implementation strategy would have the local connection name be a + pointer to the TCB for this connection. The OPEN call also specifies + whether the connection establishment is to be actively pursued, or to + be passively waited for. + + A passive OPEN request means that the process wants to accept incoming + connection requests rather than attempting to initiate a connection. + Often the process requesting a passive OPEN will accept a connection + request from any caller. In this case a foreign socket of all zeros + is used to denote an unspecified socket. Unspecified foreign sockets + are allowed only on passive OPENs. 
+ + A service process that wished to provide services for unknown other + processes would issue a passive OPEN request with an unspecified + foreign socket. Then a connection could be made with any process that + requested a connection to this local socket. It would help if this + local socket were known to be associated with this service. + + Well-known sockets are a convenient mechanism for a priori associating + a socket address with a standard service. For instance, the + "Telnet-Server" process is permanently assigned to a particular + socket, and other sockets are reserved for File Transfer, Remote Job + Entry, Text Generator, Echoer, and Sink processes (the last three + being for test purposes). A socket address might be reserved for + access to a "Look-Up" service which would return the specific socket + at which a newly created service would be provided. The concept of a + well-known socket is part of the TCP specification, but the assignment + of sockets to services is outside this specification. (See [4].) + + Processes can issue passive OPENs and wait for matching active OPENs + from other processes and be informed by the TCP when connections have + been established. Two processes which issue active OPENs to each + other at the same time will be correctly connected. This flexibility + is critical for the support of distributed computing in which + components act asynchronously with respect to each other. + + There are two principal cases for matching the sockets in the local + passive OPENs and a foreign active OPEN. In the first case, the + local passive OPEN has fully specified the foreign socket. In this + case, the match must be exact. In the second case, the local passive + OPEN has left the foreign socket unspecified. In this case, any + foreign socket is acceptable as long as the local sockets match. + Other possibilities include partially restricted matches.
+ + + + [Page 11] + + + September 1981 +Transmission Control Protocol +Philosophy + + + + If there are several pending passive OPENs (recorded in TCBs) with the + same local socket, a foreign active OPEN will be matched to a TCB + with the specific foreign socket in the foreign active OPEN, if such a + TCB exists, before selecting a TCB with an unspecified foreign socket. + + The procedures to establish connections utilize the synchronize (SYN) + control flag and involve an exchange of three messages. This + exchange has been termed a three-way handshake [3]. + + A connection is initiated by the rendezvous of an arriving segment + containing a SYN and a waiting TCB entry each created by a user OPEN + command. The matching of local and foreign sockets determines when a + connection has been initiated. The connection becomes "established" + when sequence numbers have been synchronized in both directions. + + The clearing of a connection also involves the exchange of segments, + in this case carrying the FIN control flag. + +2.8. Data Communication + + The data that flows on a connection may be thought of as a stream of + octets. The sending user indicates in each SEND call whether the data + in that call (and any preceding calls) should be immediately pushed + through to the receiving user by the setting of the PUSH flag. + + A sending TCP is allowed to collect data from the sending user and to + send that data in segments at its own convenience, until the push + function is signaled; then it must send all unsent data. When a + receiving TCP sees the PUSH flag, it must not wait for more data from + the sending TCP before passing the data to the receiving process. + + There is no necessary relationship between push functions and segment + boundaries. The data in any particular segment may be the result of a + single SEND call, in whole or part, or of multiple SEND calls.
+ + The purpose of push function and the PUSH flag is to push data through + from the sending user to the receiving user. It does not provide a + record service. + + There is a coupling between the push function and the use of buffers + of data that cross the TCP/user interface. Each time a PUSH flag is + associated with data placed into the receiving user's buffer, the + buffer is returned to the user for processing even if the buffer is + not filled. If data arrives that fills the user's buffer before a + PUSH is seen, the data is passed to the user in buffer size units. + + TCP also provides a means to communicate to the receiver of data that + at some point further along in the data stream than the receiver is + + +[Page 12] + + +September 1981 + Transmission Control Protocol + Philosophy + + + + currently reading there is urgent data. TCP does not attempt to + define what the user specifically does upon being notified of pending + urgent data, but the general notion is that the receiving process will + take action to process the urgent data quickly. + +2.9. Precedence and Security + + The TCP makes use of the internet protocol type of service field and + security option to provide precedence and security on a per connection + basis to TCP users. Not all TCP modules will necessarily function in + a multilevel secure environment; some may be limited to unclassified + use only, and others may operate at only one security level and + compartment. Consequently, some TCP implementations and services to + users may be limited to a subset of the multilevel secure case. + + TCP modules which operate in a multilevel secure environment must + properly mark outgoing segments with the security, compartment, and + precedence. Such TCP modules must also provide to their users or + higher level protocols such as Telnet or THP an interface to allow + them to specify the desired security level, compartment, and + precedence of connections. + +2.10. 
Robustness Principle + + TCP implementations will follow a general principle of robustness: be + conservative in what you do, be liberal in what you accept from + others. + + + + + + + + + + + + + + + + + + + + + + + + [Page 13] + + + September 1981 +Transmission Control Protocol + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +[Page 14] + + +September 1981 + Transmission Control Protocol + + + + 3. FUNCTIONAL SPECIFICATION + +3.1. Header Format + + TCP segments are sent as internet datagrams. The Internet Protocol + header carries several information fields, including the source and + destination host addresses [2]. A TCP header follows the internet + header, supplying information specific to the TCP protocol. This + division allows for the existence of host level protocols other than + TCP. + + TCP Header Format + + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Source Port | Destination Port | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Sequence Number | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Acknowledgment Number | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Data | |U|A|P|R|S|F| | + | Offset| Reserved |R|C|S|S|Y|I| Window | + | | |G|K|H|T|N|N| | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Checksum | Urgent Pointer | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Options | Padding | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | data | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + TCP Header Format + + Note that one tick mark represents one bit position. + + Figure 3. + + Source Port: 16 bits + + The source port number. + + Destination Port: 16 bits + + The destination port number. 
+ + + + + [Page 15] + + + September 1981 +Transmission Control Protocol +Functional Specification + + + + Sequence Number: 32 bits + + The sequence number of the first data octet in this segment (except + when SYN is present). If SYN is present the sequence number is the + initial sequence number (ISN) and the first data octet is ISN+1. + + Acknowledgment Number: 32 bits + + If the ACK control bit is set this field contains the value of the + next sequence number the sender of the segment is expecting to + receive. Once a connection is established this is always sent. + + Data Offset: 4 bits + + The number of 32 bit words in the TCP Header. This indicates where + the data begins. The TCP header (even one including options) is an + integer multiple of 32 bits long. + + Reserved: 6 bits + + Reserved for future use. Must be zero. + + Control Bits: 6 bits (from left to right): + + URG: Urgent Pointer field significant + ACK: Acknowledgment field significant + PSH: Push Function + RST: Reset the connection + SYN: Synchronize sequence numbers + FIN: No more data from sender + + Window: 16 bits + + The number of data octets beginning with the one indicated in the + acknowledgment field which the sender of this segment is willing to + accept. + + Checksum: 16 bits + + The checksum field is the 16 bit one's complement of the one's + complement sum of all 16 bit words in the header and text. If a + segment contains an odd number of header and text octets to be + checksummed, the last octet is padded on the right with zeros to + form a 16 bit word for checksum purposes. The pad is not + transmitted as part of the segment. While computing the checksum, + the checksum field itself is replaced with zeros. + + The checksum also covers a 96 bit pseudo header conceptually + + +[Page 16] + + +September 1981 + Transmission Control Protocol + Functional Specification + + + + prefixed to the TCP header.
This pseudo header contains the Source + Address, the Destination Address, the Protocol, and TCP length. + This gives the TCP protection against misrouted segments. This + information is carried in the Internet Protocol and is transferred + across the TCP/Network interface in the arguments or results of + calls by the TCP on the IP. + + +--------+--------+--------+--------+ + | Source Address | + +--------+--------+--------+--------+ + | Destination Address | + +--------+--------+--------+--------+ + | zero | PTCL | TCP Length | + +--------+--------+--------+--------+ + + The TCP Length is the TCP header length plus the data length in + octets (this is not an explicitly transmitted quantity, but is + computed), and it does not count the 12 octets of the pseudo + header. + + Urgent Pointer: 16 bits + + This field communicates the current value of the urgent pointer as a + positive offset from the sequence number in this segment. The + urgent pointer points to the sequence number of the octet following + the urgent data. This field is only to be interpreted in segments with + the URG control bit set. + + Options: variable + + Options may occupy space at the end of the TCP header and are a + multiple of 8 bits in length. All options are included in the + checksum. An option may begin on any octet boundary. There are two + cases for the format of an option: + + Case 1: A single octet of option-kind. + + Case 2: An octet of option-kind, an octet of option-length, and + the actual option-data octets. + + The option-length counts the two octets of option-kind and + option-length as well as the option-data octets. + + Note that the list of options may be shorter than the data offset + field might imply. The content of the header beyond the + End-of-Option option must be header padding (i.e., zero). + + A TCP must implement all options.
+ + + [Page 17] + + + September 1981 +Transmission Control Protocol +Functional Specification + + + + Currently defined options include (kind indicated in octal): + + Kind Length Meaning + ---- ------ ------- + 0 - End of option list. + 1 - No-Operation. + 2 4 Maximum Segment Size. + + + Specific Option Definitions + + End of Option List + + +--------+ + |00000000| + +--------+ + Kind=0 + + This option code indicates the end of the option list. This + might not coincide with the end of the TCP header according to + the Data Offset field. This is used at the end of all options, + not the end of each option, and need only be used if the end of + the options would not otherwise coincide with the end of the TCP + header. + + No-Operation + + +--------+ + |00000001| + +--------+ + Kind=1 + + This option code may be used between options, for example, to + align the beginning of a subsequent option on a word boundary. + There is no guarantee that senders will use this option, so + receivers must be prepared to process options even if they do + not begin on a word boundary. + + Maximum Segment Size + + +--------+--------+---------+--------+ + |00000010|00000100| max seg size | + +--------+--------+---------+--------+ + Kind=2 Length=4 + + + + + + +[Page 18] + + +September 1981 + Transmission Control Protocol + Functional Specification + + + + Maximum Segment Size Option Data: 16 bits + + If this option is present, then it communicates the maximum + receive segment size at the TCP which sends this segment. + This field must only be sent in the initial connection request + (i.e., in segments with the SYN control bit set). If this + option is not used, any segment size is allowed. + + Padding: variable + + The TCP header padding is used to ensure that the TCP header ends + and data begins on a 32 bit boundary. The padding is composed of + zeros. + +3.2. 
Terminology + + Before we can discuss very much about the operation of the TCP we need + to introduce some detailed terminology. The maintenance of a TCP + connection requires the remembering of several variables. We conceive + of these variables being stored in a connection record called a + Transmission Control Block or TCB. Among the variables stored in the + TCB are the local and remote socket numbers, the security and + precedence of the connection, pointers to the user's send and receive + buffers, pointers to the retransmit queue and to the current segment. + In addition several variables relating to the send and receive + sequence numbers are stored in the TCB. + + Send Sequence Variables + + SND.UNA - send unacknowledged + SND.NXT - send next + SND.WND - send window + SND.UP - send urgent pointer + SND.WL1 - segment sequence number used for last window update + SND.WL2 - segment acknowledgment number used for last window + update + ISS - initial send sequence number + + Receive Sequence Variables + + RCV.NXT - receive next + RCV.WND - receive window + RCV.UP - receive urgent pointer + IRS - initial receive sequence number + + + + + + + [Page 19] + + + September 1981 +Transmission Control Protocol +Functional Specification + + + + The following diagrams may help to relate some of these variables to + the sequence space. + + Send Sequence Space + + 1 2 3 4 + ----------|----------|----------|---------- + SND.UNA SND.NXT SND.UNA + +SND.WND + + 1 - old sequence numbers which have been acknowledged + 2 - sequence numbers of unacknowledged data + 3 - sequence numbers allowed for new data transmission + 4 - future sequence numbers which are not yet allowed + + Send Sequence Space + + Figure 4. + + + + The send window is the portion of the sequence space labeled 3 in + figure 4. 
+ + Receive Sequence Space + + 1 2 3 + ----------|----------|---------- + RCV.NXT RCV.NXT + +RCV.WND + + 1 - old sequence numbers which have been acknowledged + 2 - sequence numbers allowed for new reception + 3 - future sequence numbers which are not yet allowed + + Receive Sequence Space + + Figure 5. + + + + The receive window is the portion of the sequence space labeled 2 in + figure 5. + + There are also some variables used frequently in the discussion that + take their values from the fields of the current segment. + + + + +[Page 20] + + +September 1981 + Transmission Control Protocol + Functional Specification + + + + Current Segment Variables + + SEG.SEQ - segment sequence number + SEG.ACK - segment acknowledgment number + SEG.LEN - segment length + SEG.WND - segment window + SEG.UP - segment urgent pointer + SEG.PRC - segment precedence value + + A connection progresses through a series of states during its + lifetime. The states are: LISTEN, SYN-SENT, SYN-RECEIVED, + ESTABLISHED, FIN-WAIT-1, FIN-WAIT-2, CLOSE-WAIT, CLOSING, LAST-ACK, + TIME-WAIT, and the fictional state CLOSED. CLOSED is fictional + because it represents the state when there is no TCB, and therefore, + no connection. Briefly the meanings of the states are: + + LISTEN - represents waiting for a connection request from any remote + TCP and port. + + SYN-SENT - represents waiting for a matching connection request + after having sent a connection request. + + SYN-RECEIVED - represents waiting for a confirming connection + request acknowledgment after having both received and sent a + connection request. + + ESTABLISHED - represents an open connection, data received can be + delivered to the user. The normal state for the data transfer phase + of the connection. + + FIN-WAIT-1 - represents waiting for a connection termination request + from the remote TCP, or an acknowledgment of the connection + termination request previously sent. 
+ + FIN-WAIT-2 - represents waiting for a connection termination request + from the remote TCP. + + CLOSE-WAIT - represents waiting for a connection termination request + from the local user. + + CLOSING - represents waiting for a connection termination request + acknowledgment from the remote TCP. + + LAST-ACK - represents waiting for an acknowledgment of the + connection termination request previously sent to the remote TCP + (which includes an acknowledgment of its connection termination + request). + + + + [Page 21] + + + September 1981 +Transmission Control Protocol +Functional Specification + + + + TIME-WAIT - represents waiting for enough time to pass to be sure + the remote TCP received the acknowledgment of its connection + termination request. + + CLOSED - represents no connection state at all. + + A TCP connection progresses from one state to another in response to + events. The events are the user calls, OPEN, SEND, RECEIVE, CLOSE, + ABORT, and STATUS; the incoming segments, particularly those + containing the SYN, ACK, RST and FIN flags; and timeouts. + + The state diagram in figure 6 illustrates only state changes, together + with the causing events and resulting actions, but addresses neither + error conditions nor actions which are not connected with state + changes. In a later section, more detail is offered with respect to + the reaction of the TCP to events. + + NOTA BENE: this diagram is only a summary and must not be taken as + the total specification.
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +[Page 22] + + +September 1981 + Transmission Control Protocol + Functional Specification + + + + + +---------+ ---------\ active OPEN + | CLOSED | \ ----------- + +---------+<---------\ \ create TCB + | ^ \ \ snd SYN + passive OPEN | | CLOSE \ \ + ------------ | | ---------- \ \ + create TCB | | delete TCB \ \ + V | \ \ + +---------+ CLOSE | \ + | LISTEN | ---------- | | + +---------+ delete TCB | | + rcv SYN | | SEND | | + ----------- | | ------- | V + +---------+ snd SYN,ACK / \ snd SYN +---------+ + | |<----------------- ------------------>| | + | SYN | rcv SYN | SYN | + | RCVD |<-----------------------------------------------| SENT | + | | snd ACK | | + | |------------------ -------------------| | + +---------+ rcv ACK of SYN \ / rcv SYN,ACK +---------+ + | -------------- | | ----------- + | x | | snd ACK + | V V + | CLOSE +---------+ + | ------- | ESTAB | + | snd FIN +---------+ + | CLOSE | | rcv FIN + V ------- | | ------- + +---------+ snd FIN / \ snd ACK +---------+ + | FIN |<----------------- ------------------>| CLOSE | + | WAIT-1 |------------------ | WAIT | + +---------+ rcv FIN \ +---------+ + | rcv ACK of FIN ------- | CLOSE | + | -------------- snd ACK | ------- | + V x V snd FIN V + +---------+ +---------+ +---------+ + |FINWAIT-2| | CLOSING | | LAST-ACK| + +---------+ +---------+ +---------+ + | rcv ACK of FIN | rcv ACK of FIN | + | rcv FIN -------------- | Timeout=2MSL -------------- | + | ------- x V ------------ x V + \ snd ACK +---------+delete TCB +---------+ + ------------------------>|TIME WAIT|------------------>| CLOSED | + +---------+ +---------+ + + TCP Connection State Diagram + Figure 6. + + + [Page 23] + + + September 1981 +Transmission Control Protocol +Functional Specification + + + +3.3. Sequence Numbers + + A fundamental notion in the design is that every octet of data sent + over a TCP connection has a sequence number. 
Since every octet is + sequenced, each of them can be acknowledged. The acknowledgment + mechanism employed is cumulative so that an acknowledgment of sequence + number X indicates that all octets up to but not including X have been + received. This mechanism allows for straight-forward duplicate + detection in the presence of retransmission. Numbering of octets + within a segment is that the first data octet immediately following + the header is the lowest numbered, and the following octets are + numbered consecutively. + + It is essential to remember that the actual sequence number space is + finite, though very large. This space ranges from 0 to 2**32 - 1. + Since the space is finite, all arithmetic dealing with sequence + numbers must be performed modulo 2**32. This unsigned arithmetic + preserves the relationship of sequence numbers as they cycle from + 2**32 - 1 to 0 again. There are some subtleties to computer modulo + arithmetic, so great care should be taken in programming the + comparison of such values. The symbol "=<" means "less than or equal" + (modulo 2**32). + + The typical kinds of sequence number comparisons which the TCP must + perform include: + + (a) Determining that an acknowledgment refers to some sequence + number sent but not yet acknowledged. + + (b) Determining that all sequence numbers occupied by a segment + have been acknowledged (e.g., to remove the segment from a + retransmission queue). + + (c) Determining that an incoming segment contains sequence numbers + which are expected (i.e., that the segment "overlaps" the + receive window). + + + + + + + + + + + + + + +[Page 24] + + +September 1981 + Transmission Control Protocol + Functional Specification + + + + In response to sending data the TCP will receive acknowledgments. The + following comparisons are needed to process the acknowledgments. 
+ + SND.UNA = oldest unacknowledged sequence number + + SND.NXT = next sequence number to be sent + + SEG.ACK = acknowledgment from the receiving TCP (next sequence + number expected by the receiving TCP) + + SEG.SEQ = first sequence number of a segment + + SEG.LEN = the number of octets occupied by the data in the segment + (counting SYN and FIN) + + SEG.SEQ+SEG.LEN-1 = last sequence number of a segment + + A new acknowledgment (called an "acceptable ack") is one for which + the inequality below holds: + + SND.UNA < SEG.ACK =< SND.NXT + + A segment on the retransmission queue is fully acknowledged if the sum + of its sequence number and length is less than or equal to the + acknowledgment value in the incoming segment. + + When data is received the following comparisons are needed: + + RCV.NXT = next sequence number expected on an incoming segment, and + is the left or lower edge of the receive window + + RCV.NXT+RCV.WND-1 = last sequence number expected on an incoming + segment, and is the right or upper edge of the receive window + + SEG.SEQ = first sequence number occupied by the incoming segment + + SEG.SEQ+SEG.LEN-1 = last sequence number occupied by the incoming + segment + + A segment is judged to occupy a portion of valid receive sequence + space if + + RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND + + or + + RCV.NXT =< SEG.SEQ+SEG.LEN-1 < RCV.NXT+RCV.WND + + + + [Page 25] + + + September 1981 +Transmission Control Protocol +Functional Specification + + + + The first part of this test checks to see if the beginning of the + segment falls in the window, the second part of the test checks to see + if the end of the segment falls in the window; if the segment passes + either part of the test it contains data in the window. + + Actually, it is a little more complicated than this.
Due to zero + windows and zero length segments, we have four cases for the + acceptability of an incoming segment: + + Segment Receive Test + Length Window + ------- ------- ------------------------------------------- + + 0 0 SEG.SEQ = RCV.NXT + + 0 >0 RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND + + >0 0 not acceptable + + >0 >0 RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND + or RCV.NXT =< SEG.SEQ+SEG.LEN-1 < RCV.NXT+RCV.WND + + Note that when the receive window is zero no segments should be + acceptable except ACK segments. Thus, it is possible for a TCP to + maintain a zero receive window while transmitting data and receiving + ACKs. However, even when the receive window is zero, a TCP must + process the RST and URG fields of all incoming segments. + + We have taken advantage of the numbering scheme to protect certain + control information as well. This is achieved by implicitly including + some control flags in the sequence space so they can be retransmitted + and acknowledged without confusion (i.e., one and only one copy of the + control will be acted upon). Control information is not physically + carried in the segment data space. Consequently, we must adopt rules + for implicitly assigning sequence numbers to control. The SYN and FIN + are the only controls requiring this protection, and these controls + are used only at connection opening and closing. For sequence number + purposes, the SYN is considered to occur before the first actual data + octet of the segment in which it occurs, while the FIN is considered + to occur after the last actual data octet in a segment in which it + occurs. The segment length (SEG.LEN) includes both data and sequence + space occupying controls. When a SYN is present then SEG.SEQ is the + sequence number of the SYN.
+ + + + + + + +[Page 26] + + +September 1981 + Transmission Control Protocol + Functional Specification + + + + Initial Sequence Number Selection + + The protocol places no restriction on a particular connection being + used over and over again. A connection is defined by a pair of + sockets. New instances of a connection will be referred to as + incarnations of the connection. The problem that arises from this is + -- "how does the TCP identify duplicate segments from previous + incarnations of the connection?" This problem becomes apparent if the + connection is being opened and closed in quick succession, or if the + connection breaks with loss of memory and is then reestablished. + + To avoid confusion we must prevent segments from one incarnation of a + connection from being used while the same sequence numbers may still + be present in the network from an earlier incarnation. We want to + assure this, even if a TCP crashes and loses all knowledge of the + sequence numbers it has been using. When new connections are created, + an initial sequence number (ISN) generator is employed which selects a + new 32 bit ISN. The generator is bound to a (possibly fictitious) 32 + bit clock whose low order bit is incremented roughly every 4 + microseconds. Thus, the ISN cycles approximately every 4.55 hours. + Since we assume that segments will stay in the network no more than + the Maximum Segment Lifetime (MSL) and that the MSL is less than 4.55 + hours we can reasonably assume that ISN's will be unique. + + For each connection there is a send sequence number and a receive + sequence number. The initial send sequence number (ISS) is chosen by + the data sending TCP, and the initial receive sequence number (IRS) is + learned during the connection establishing procedure. + + For a connection to be established or initialized, the two TCPs must + synchronize on each other's initial sequence numbers. 
This is done in + an exchange of connection establishing segments carrying a control bit + called "SYN" (for synchronize) and the initial sequence numbers. As a + shorthand, segments carrying the SYN bit are also called "SYNs". + Hence, the solution requires a suitable mechanism for picking an + initial sequence number and a slightly involved handshake to exchange + the ISN's. + + The synchronization requires each side to send its own initial + sequence number and to receive a confirmation of it in acknowledgment + from the other side. Each side must also receive the other side's + initial sequence number and send a confirming acknowledgment. + + 1) A --> B SYN my sequence number is X + 2) A <-- B ACK your sequence number is X + 3) A <-- B SYN my sequence number is Y + 4) A --> B ACK your sequence number is Y + + + + [Page 27] + + + September 1981 +Transmission Control Protocol +Functional Specification + + + + Because steps 2 and 3 can be combined in a single message this is + called the three way (or three message) handshake. + + A three way handshake is necessary because sequence numbers are not + tied to a global clock in the network, and TCPs may have different + mechanisms for picking the ISN's. The receiver of the first SYN has + no way of knowing whether the segment was an old delayed one or not, + unless it remembers the last sequence number used on the connection + (which is not always possible), and so it must ask the sender to + verify this SYN. The three way handshake and the advantages of a + clock-driven scheme are discussed in [3]. + + Knowing When to Keep Quiet + + To be sure that a TCP does not create a segment that carries a + sequence number which may be duplicated by an old segment remaining in + the network, the TCP must keep quiet for a maximum segment lifetime + (MSL) before assigning any sequence numbers upon starting up or + recovering from a crash in which memory of sequence numbers in use was + lost.
For this specification the MSL is taken to be 2 minutes. This + is an engineering choice, and may be changed if experience indicates + it is desirable to do so. Note that if a TCP is reinitialized in some + sense, yet retains its memory of sequence numbers in use, then it need + not wait at all; it must only be sure to use sequence numbers larger + than those recently used. + + The TCP Quiet Time Concept + + This specification provides that hosts which "crash" without + retaining any knowledge of the last sequence numbers transmitted on + each active (i.e., not closed) connection shall delay emitting any + TCP segments for at least the agreed Maximum Segment Lifetime (MSL) + in the internet system of which the host is a part. In the + paragraphs below, an explanation for this specification is given. + TCP implementors may violate the "quiet time" restriction, but only + at the risk of causing some old data to be accepted as new or new + data rejected as old duplicated by some receivers in the internet + system. + + TCPs consume sequence number space each time a segment is formed and + entered into the network output queue at a source host. The + duplicate detection and sequencing algorithm in the TCP protocol + relies on the unique binding of segment data to sequence space to + the extent that sequence numbers will not cycle through all 2**32 + values before the segment data bound to those sequence numbers has + been delivered and acknowledged by the receiver and all duplicate + copies of the segments have "drained" from the internet. Without + such an assumption, two distinct TCP segments could conceivably be + + +[Page 28] + + +September 1981 + Transmission Control Protocol + Functional Specification + + + + assigned the same or overlapping sequence numbers, causing confusion + at the receiver as to which data is new and which is old. Remember + that each segment is bound to as many consecutive sequence numbers + as there are octets of data in the segment. 
+ + Under normal conditions, TCPs keep track of the next sequence number + to emit and the oldest awaiting acknowledgment so as to avoid + mistakenly using a sequence number over before its first use has + been acknowledged. This alone does not guarantee that old duplicate + data is drained from the net, so the sequence space has been made + very large to reduce the probability that a wandering duplicate will + cause trouble upon arrival. At 2 megabits/sec. it takes 4.5 hours + to use up 2**32 octets of sequence space. Since the maximum segment + lifetime in the net is not likely to exceed a few tens of seconds, + this is deemed ample protection for foreseeable nets, even if data + rates escalate to 10's of megabits/sec. At 100 megabits/sec, the + cycle time is 5.4 minutes which may be a little short, but still + within reason. + + The basic duplicate detection and sequencing algorithm in TCP can be + defeated, however, if a source TCP does not have any memory of the + sequence numbers it last used on a given connection. For example, if + the TCP were to start all connections with sequence number 0, then + upon crashing and restarting, a TCP might re-form an earlier + connection (possibly after half-open connection resolution) and emit + packets with sequence numbers identical to or overlapping with + packets still in the network which were emitted on an earlier + incarnation of the same connection. In the absence of knowledge + about the sequence numbers used on a particular connection, the TCP + specification recommends that the source delay for MSL seconds + before emitting segments on the connection, to allow time for + segments from the earlier connection incarnation to drain from the + system. + + Even hosts which can remember the time of day and use it to select + initial sequence number values are not immune from this problem + (i.e., even if time of day is used to select an initial sequence + number for each new connection incarnation).
+ + Suppose, for example, that a connection is opened starting with + sequence number S. Suppose that this connection is not used much + and that eventually the initial sequence number function (ISN(t)) + takes on a value equal to the sequence number, say S1, of the last + segment sent by this TCP on a particular connection. Now suppose, + at this instant, the host crashes, recovers, and establishes a new + incarnation of the connection. The initial sequence number chosen is + S1 = ISN(t) -- last used sequence number on old incarnation of + connection! If the recovery occurs quickly enough, any old + + + [Page 29] + + + September 1981 +Transmission Control Protocol +Functional Specification + + + + duplicates in the net bearing sequence numbers in the neighborhood + of S1 may arrive and be treated as new packets by the receiver of + the new incarnation of the connection. + + The problem is that the recovering host may not know for how long it + crashed nor does it know whether there are still old duplicates in + the system from earlier connection incarnations. + + One way to deal with this problem is to deliberately delay emitting + segments for one MSL after recovery from a crash -- this is the "quiet + time" specification. Hosts which prefer to avoid waiting and are + willing to risk possible confusion of old and new packets at a given + destination may choose not to wait for the "quiet time". + Implementors may provide TCP users with the ability to select on a + connection by connection basis whether to wait after a crash, or may + informally implement the "quiet time" for all connections. + Obviously, even where a user selects to "wait," this is not + necessary after the host has been "up" for at least MSL seconds.
+ + To summarize: every segment emitted occupies one or more sequence + numbers in the sequence space, the numbers occupied by a segment are + "busy" or "in use" until MSL seconds have passed, upon crashing a + block of space-time is occupied by the octets of the last emitted + segment, if a new connection is started too soon and uses any of the + sequence numbers in the space-time footprint of the last segment of + the previous connection incarnation, there is a potential sequence + number overlap area which could cause confusion at the receiver. + +3.4. Establishing a connection + + The "three-way handshake" is the procedure used to establish a + connection. This procedure normally is initiated by one TCP and + responded to by another TCP. The procedure also works if two TCP + simultaneously initiate the procedure. When simultaneous attempt + occurs, each TCP receives a "SYN" segment which carries no + acknowledgment after it has sent a "SYN". Of course, the arrival of + an old duplicate "SYN" segment can potentially make it appear, to the + recipient, that a simultaneous connection initiation is in progress. + Proper use of "reset" segments can disambiguate these cases. + + Several examples of connection initiation follow. Although these + examples do not show connection synchronization using data-carrying + segments, this is perfectly legitimate, so long as the receiving TCP + doesn't deliver the data to the user until it is clear the data is + valid (i.e., the data must be buffered at the receiver until the + connection reaches the ESTABLISHED state). The three-way handshake + reduces the possibility of false connections. It is the + + + +[Page 30] + + +September 1981 + Transmission Control Protocol + Functional Specification + + + + implementation of a trade-off between memory and messages to provide + information for this checking. + + The simplest three-way handshake is shown in figure 7 below. The + figures should be interpreted in the following way. 
Each line is + numbered for reference purposes. Right arrows (-->) indicate + departure of a TCP segment from TCP A to TCP B, or arrival of a + segment at B from A. Left arrows (<--) indicate the reverse. + Ellipsis (...) indicates a segment which is still in the network + (delayed). An "XXX" indicates a segment which is lost or rejected. + Comments appear in parentheses. TCP states represent the state AFTER + the departure or arrival of the segment (whose contents are shown in + the center of each line). Segment contents are shown in abbreviated + form, with sequence number, control flags, and ACK field. Other + fields such as window, addresses, lengths, and text have been left out + in the interest of clarity. + + + + TCP A TCP B + + 1. CLOSED LISTEN + + 2. SYN-SENT --> <SEQ=100><CTL=SYN> --> SYN-RECEIVED + + 3. ESTABLISHED <-- <SEQ=300><ACK=101><CTL=SYN,ACK> <-- SYN-RECEIVED + + 4. ESTABLISHED --> <SEQ=101><ACK=301><CTL=ACK> --> ESTABLISHED + + 5. ESTABLISHED --> <SEQ=101><ACK=301><CTL=ACK><DATA> --> ESTABLISHED + + Basic 3-Way Handshake for Connection Synchronization + + Figure 7. + + In line 2 of figure 7, TCP A begins by sending a SYN segment + indicating that it will use sequence numbers starting with sequence + number 100. In line 3, TCP B sends a SYN and acknowledges the SYN it + received from TCP A. Note that the acknowledgment field indicates TCP + B is now expecting to hear sequence 101, acknowledging the SYN which + occupied sequence 100. + + At line 4, TCP A responds with an empty segment containing an ACK for + TCP B's SYN; and in line 5, TCP A sends some data. Note that the + sequence number of the segment in line 5 is the same as in line 4 + because the ACK does not occupy sequence number space (if it did, we + would wind up ACKing ACK's!). + + + + [Page 31] + + + September 1981 +Transmission Control Protocol +Functional Specification + + + + Simultaneous initiation is only slightly more complex, as is shown in + figure 8. Each TCP cycles from CLOSED to SYN-SENT to SYN-RECEIVED to + ESTABLISHED. + + + + TCP A TCP B + + 1. CLOSED CLOSED + + 2.
SYN-SENT --> <SEQ=100><CTL=SYN> ... + + 3. SYN-RECEIVED <-- <SEQ=300><CTL=SYN> <-- SYN-SENT + + 4. ... <SEQ=100><CTL=SYN> --> SYN-RECEIVED + + 5. SYN-RECEIVED --> <SEQ=100><ACK=301><CTL=SYN,ACK> ... + + 6. ESTABLISHED <-- <SEQ=300><ACK=101><CTL=SYN,ACK> <-- SYN-RECEIVED + + 7. ... <SEQ=101><ACK=301><CTL=ACK> --> ESTABLISHED + + Simultaneous Connection Synchronization + + Figure 8. + + The principal reason for the three-way handshake is to prevent old + duplicate connection initiations from causing confusion. To deal with + this, a special control message, reset, has been devised. If the + receiving TCP is in a non-synchronized state (i.e., SYN-SENT, + SYN-RECEIVED), it returns to LISTEN on receiving an acceptable reset. + If the TCP is in one of the synchronized states (ESTABLISHED, + FIN-WAIT-1, FIN-WAIT-2, CLOSE-WAIT, CLOSING, LAST-ACK, TIME-WAIT), it + aborts the connection and informs its user. We discuss this latter + case under "half-open" connections below. + + + + + + + + + + + + + + + +[Page 32] + + +September 1981 + Transmission Control Protocol + Functional Specification + + + + + + TCP A TCP B + + 1. CLOSED LISTEN + + 2. SYN-SENT --> <SEQ=100><CTL=SYN> ... + + 3. (duplicate) ... <SEQ=90><CTL=SYN> --> SYN-RECEIVED + + 4. SYN-SENT <-- <SEQ=300><ACK=91><CTL=SYN,ACK> <-- SYN-RECEIVED + + 5. SYN-SENT --> <SEQ=91><CTL=RST> --> LISTEN + + + 6. ... <SEQ=100><CTL=SYN> --> SYN-RECEIVED + + 7. SYN-SENT <-- <SEQ=400><ACK=101><CTL=SYN,ACK> <-- SYN-RECEIVED + + 8. ESTABLISHED --> <SEQ=101><ACK=401><CTL=ACK> --> ESTABLISHED + + Recovery from Old Duplicate SYN + + Figure 9. + + As a simple example of recovery from old duplicates, consider + figure 9. At line 3, an old duplicate SYN arrives at TCP B. TCP B + cannot tell that this is an old duplicate, so it responds normally + (line 4). TCP A detects that the ACK field is incorrect and returns a + RST (reset) with its SEQ field selected to make the segment + believable. TCP B, on receiving the RST, returns to the LISTEN state. + When the original SYN (pun intended) finally arrives at line 6, the + synchronization proceeds normally. If the SYN at line 6 had arrived + before the RST, a more complex exchange might have occurred with RST's + sent in both directions.
+ + Half-Open Connections and Other Anomalies + + An established connection is said to be "half-open" if one of the + TCPs has closed or aborted the connection at its end without the + knowledge of the other, or if the two ends of the connection have + become desynchronized owing to a crash that resulted in loss of + memory. Such connections will automatically become reset if an + attempt is made to send data in either direction. However, half-open + connections are expected to be unusual, and the recovery procedure is + mildly involved. + + If at site A the connection no longer exists, then an attempt by the + + + [Page 33] + + + September 1981 +Transmission Control Protocol +Functional Specification + + + + user at site B to send any data on it will result in the site B TCP + receiving a reset control message. Such a message indicates to the + site B TCP that something is wrong, and it is expected to abort the + connection. + + Assume that two user processes A and B are communicating with one + another when a crash occurs causing loss of memory to A's TCP. + Depending on the operating system supporting A's TCP, it is likely + that some error recovery mechanism exists. When the TCP is up again, + A is likely to start again from the beginning or from a recovery + point. As a result, A will probably try to OPEN the connection again + or try to SEND on the connection it believes open. In the latter + case, it receives the error message "connection not open" from the + local (A's) TCP. In an attempt to establish the connection, A's TCP + will send a segment containing SYN. This scenario leads to the + example shown in figure 10. After TCP A crashes, the user attempts to + re-open the connection. TCP B, in the meantime, thinks the connection + is open. + + + + TCP A TCP B + + 1. (CRASH) (send 300,receive 100) + + 2. CLOSED ESTABLISHED + + 3. SYN-SENT --> <SEQ=400><CTL=SYN> --> (??) + + 4. (!!) <-- <SEQ=300><ACK=100><CTL=ACK> <-- ESTABLISHED + + 5. SYN-SENT --> <SEQ=100><CTL=RST> --> (Abort!!) + + 6. SYN-SENT CLOSED + + 7.
SYN-SENT --> <SEQ=400><CTL=SYN> --> + + Half-Open Connection Discovery + + Figure 10. + + When the SYN arrives at line 3, TCP B, being in a synchronized state, + and the incoming segment outside the window, responds with an + acknowledgment indicating what sequence it next expects to hear (ACK + 100). TCP A sees that this segment does not acknowledge anything it + sent and, being unsynchronized, sends a reset (RST) because it has + detected a half-open connection. TCP B aborts at line 5. TCP A will + + + +[Page 34] + + +September 1981 + Transmission Control Protocol + Functional Specification + + + + continue to try to establish the connection; the problem is now + reduced to the basic 3-way handshake of figure 7. + + An interesting alternative case occurs when TCP A crashes and TCP B + tries to send data on what it thinks is a synchronized connection. + This is illustrated in figure 11. In this case, the data arriving at + TCP A from TCP B (line 2) is unacceptable because no such connection + exists, so TCP A sends a RST. The RST is acceptable so TCP B + processes it and aborts the connection. + + + + TCP A TCP B + + 1. (CRASH) (send 300,receive 100) + + 2. (??) <-- <SEQ=300><ACK=100><DATA=10><CTL=ACK> <-- ESTABLISHED + + 3. --> <SEQ=100><CTL=RST> --> (ABORT!!) + + Active Side Causes Half-Open Connection Discovery + + Figure 11. + + In figure 12, we find the two TCPs A and B with passive connections + waiting for SYN. An old duplicate arriving at TCP B (line 2) stirs B + into action. A SYN-ACK is returned (line 3) and causes TCP A to + generate a RST (the ACK in line 3 is not acceptable). TCP B accepts + the reset and returns to its passive LISTEN state. + + + + TCP A TCP B + + 1. LISTEN LISTEN + + 2. ... <SEQ=Z><CTL=SYN> --> SYN-RECEIVED + + 3. (??) <-- <SEQ=X><ACK=Z+1><CTL=SYN,ACK> <-- SYN-RECEIVED + + 4. --> <SEQ=Z+1><CTL=RST> --> (return to LISTEN!) + + 5. LISTEN LISTEN + + Old Duplicate SYN Initiates a Reset on two Passive Sockets + + Figure 12.
+ + + + [Page 35] + + + September 1981 +Transmission Control Protocol +Functional Specification + + + + A variety of other cases are possible, all of which are accounted for + by the following rules for RST generation and processing. + + Reset Generation + + As a general rule, reset (RST) must be sent whenever a segment arrives + which apparently is not intended for the current connection. A reset + must not be sent if it is not clear that this is the case. + + There are three groups of states: + + 1. If the connection does not exist (CLOSED) then a reset is sent + in response to any incoming segment except another reset. In + particular, SYNs addressed to a non-existent connection are rejected + by this means. + + If the incoming segment has an ACK field, the reset takes its + sequence number from the ACK field of the segment, otherwise the + reset has sequence number zero and the ACK field is set to the sum + of the sequence number and segment length of the incoming segment. + The connection remains in the CLOSED state. + + 2. If the connection is in any non-synchronized state (LISTEN, + SYN-SENT, SYN-RECEIVED), and the incoming segment acknowledges + something not yet sent (the segment carries an unacceptable ACK), or + if an incoming segment has a security level or compartment which + does not exactly match the level and compartment requested for the + connection, a reset is sent. + + If our SYN has not been acknowledged and the precedence level of the + incoming segment is higher than the precedence level requested then + either raise the local precedence level (if allowed by the user and + the system) or send a reset; or if the precedence level of the + incoming segment is lower than the precedence level requested then + continue as if the precedence matched exactly (if the remote TCP + cannot raise the precedence level to match ours this will be + detected in the next segment it sends, and the connection will be + terminated then). 
If our SYN has been acknowledged (perhaps in this + incoming segment) the precedence level of the incoming segment must + match the local precedence level exactly, if it does not a reset + must be sent. + + If the incoming segment has an ACK field, the reset takes its + sequence number from the ACK field of the segment, otherwise the + reset has sequence number zero and the ACK field is set to the sum + of the sequence number and segment length of the incoming segment. + The connection remains in the same state. + + + +[Page 36] + + +September 1981 + Transmission Control Protocol + Functional Specification + + + + 3. If the connection is in a synchronized state (ESTABLISHED, + FIN-WAIT-1, FIN-WAIT-2, CLOSE-WAIT, CLOSING, LAST-ACK, TIME-WAIT), + any unacceptable segment (out of window sequence number or + unacceptable acknowledgment number) must elicit only an empty + acknowledgment segment containing the current send-sequence number + and an acknowledgment indicating the next sequence number expected + to be received, and the connection remains in the same state. + + If an incoming segment has a security level, or compartment, or + precedence which does not exactly match the level, and compartment, + and precedence requested for the connection, a reset is sent and + connection goes to the CLOSED state. The reset takes its sequence + number from the ACK field of the incoming segment. + + Reset Processing + + In all states except SYN-SENT, all reset (RST) segments are validated + by checking their SEQ-fields. A reset is valid if its sequence number + is in the window. In the SYN-SENT state (a RST received in response + to an initial SYN), the RST is acceptable if the ACK field + acknowledges the SYN. + + The receiver of a RST first validates it, then changes state. If the + receiver was in the LISTEN state, it ignores it.
If the receiver was + in SYN-RECEIVED state and had previously been in the LISTEN state, + then the receiver returns to the LISTEN state, otherwise the receiver + aborts the connection and goes to the CLOSED state. If the receiver + was in any other state, it aborts the connection and advises the user + and goes to the CLOSED state. + +3.5. Closing a Connection + + CLOSE is an operation meaning "I have no more data to send." The + notion of closing a full-duplex connection is subject to ambiguous + interpretation, of course, since it may not be obvious how to treat + the receiving side of the connection. We have chosen to treat CLOSE + in a simplex fashion. The user who CLOSEs may continue to RECEIVE + until he is told that the other side has CLOSED also. Thus, a program + could initiate several SENDs followed by a CLOSE, and then continue to + RECEIVE until signaled that a RECEIVE failed because the other side + has CLOSED. We assume that the TCP will signal a user, even if no + RECEIVEs are outstanding, that the other side has closed, so the user + can terminate his side gracefully. A TCP will reliably deliver all + buffers SENT before the connection was CLOSED so a user who expects no + data in return need only wait to hear the connection was CLOSED + successfully to know that all his data was received at the destination + TCP. Users must keep reading connections they close for sending until + the TCP says no more data. + + + [Page 37] + + + September 1981 +Transmission Control Protocol +Functional Specification + + + + There are essentially three cases: + + 1) The user initiates by telling the TCP to CLOSE the connection + + 2) The remote TCP initiates by sending a FIN control signal + + 3) Both users CLOSE simultaneously + + Case 1: Local user initiates the close + + In this case, a FIN segment can be constructed and placed on the + outgoing segment queue. No further SENDs from the user will be + accepted by the TCP, and it enters the FIN-WAIT-1 state. 
RECEIVEs + are allowed in this state. All segments preceding and including FIN + will be retransmitted until acknowledged. When the other TCP has + both acknowledged the FIN and sent a FIN of its own, the first TCP + can ACK this FIN. Note that a TCP receiving a FIN will ACK but not + send its own FIN until its user has CLOSED the connection also. + + Case 2: TCP receives a FIN from the network + + If an unsolicited FIN arrives from the network, the receiving TCP + can ACK it and tell the user that the connection is closing. The + user will respond with a CLOSE, upon which the TCP can send a FIN to + the other TCP after sending any remaining data. The TCP then waits + until its own FIN is acknowledged whereupon it deletes the + connection. If an ACK is not forthcoming, after the user timeout + the connection is aborted and the user is told. + + Case 3: both users close simultaneously + + A simultaneous CLOSE by users at both ends of a connection causes + FIN segments to be exchanged. When all segments preceding the FINs + have been processed and acknowledged, each TCP can ACK the FIN it + has received. Both will, upon receiving these ACKs, delete the + connection. + + + + + + + + + + + + + + +[Page 38] + + +September 1981 + Transmission Control Protocol + Functional Specification + + + + + + TCP A TCP B + + 1. ESTABLISHED ESTABLISHED + + 2. (Close) + FIN-WAIT-1 --> <SEQ=100><ACK=300><CTL=FIN,ACK> --> CLOSE-WAIT + + 3. FIN-WAIT-2 <-- <SEQ=300><ACK=101><CTL=ACK> <-- CLOSE-WAIT + + 4. (Close) + TIME-WAIT <-- <SEQ=300><ACK=101><CTL=FIN,ACK> <-- LAST-ACK + + 5. TIME-WAIT --> <SEQ=101><ACK=301><CTL=ACK> --> CLOSED + + 6. (2 MSL) + CLOSED + + Normal Close Sequence + + Figure 13. + + + + TCP A TCP B + + 1. ESTABLISHED ESTABLISHED + + 2. (Close) (Close) + FIN-WAIT-1 --> <SEQ=100><ACK=300><CTL=FIN,ACK> ... FIN-WAIT-1 + <-- <SEQ=300><ACK=100><CTL=FIN,ACK> <-- + ... <SEQ=100><ACK=300><CTL=FIN,ACK> --> + + 3. CLOSING --> <SEQ=101><ACK=301><CTL=ACK> ... CLOSING + <-- <SEQ=301><ACK=101><CTL=ACK> <-- + ... <SEQ=101><ACK=301><CTL=ACK> --> + + 4. TIME-WAIT TIME-WAIT + (2 MSL) (2 MSL) + CLOSED CLOSED + + Simultaneous Close Sequence + + Figure 14. + + + + + + [Page 39] + + + September 1981 +Transmission Control Protocol +Functional Specification + + + +3.6.
Precedence and Security + + The intent is that connection be allowed only between ports operating + with exactly the same security and compartment values and at the + higher of the precedence level requested by the two ports. + + The precedence and security parameters used in TCP are exactly those + defined in the Internet Protocol (IP) [2]. Throughout this TCP + specification the term "security/compartment" is intended to indicate + the security parameters used in IP including security, compartment, + user group, and handling restriction. + + A connection attempt with mismatched security/compartment values or a + lower precedence value must be rejected by sending a reset. Rejecting + a connection due to too low a precedence only occurs after an + acknowledgment of the SYN has been received. + + Note that TCP modules which operate only at the default value of + precedence will still have to check the precedence of incoming + segments and possibly raise the precedence level they use on the + connection. + + The security parameters may be used even in a non-secure environment + (the values would indicate unclassified data), thus hosts in + non-secure environments must be prepared to receive the security + parameters, though they need not send them. + +3.7. Data Communication + + Once the connection is established data is communicated by the + exchange of segments. Because segments may be lost due to errors + (checksum test failure), or network congestion, TCP uses + retransmission (after a timeout) to ensure delivery of every segment. + Duplicate segments may arrive due to network or TCP retransmission. + As discussed in the section on sequence numbers the TCP performs + certain tests on the sequence and acknowledgment numbers in the + segments to verify their acceptability. + + The sender of data keeps track of the next sequence number to use in + the variable SND.NXT. The receiver of data keeps track of the next + sequence number to expect in the variable RCV.NXT.
The sender of data + keeps track of the oldest unacknowledged sequence number in the + variable SND.UNA. If the data flow is momentarily idle and all data + sent has been acknowledged then the three variables will be equal. + + When the sender creates a segment and transmits it the sender advances + SND.NXT. When the receiver accepts a segment it advances RCV.NXT and + sends an acknowledgment. When the data sender receives an + + +[Page 40] + + +September 1981 + Transmission Control Protocol + Functional Specification + + + + acknowledgment it advances SND.UNA. The extent to which the values of + these variables differ is a measure of the delay in the communication. + The amount by which the variables are advanced is the length of the + data in the segment. Note that once in the ESTABLISHED state all + segments must carry current acknowledgment information. + + The CLOSE user call implies a push function, as does the FIN control + flag in an incoming segment. + + Retransmission Timeout + + Because of the variability of the networks that compose an + internetwork system and the wide range of uses of TCP connections the + retransmission timeout must be dynamically determined. One procedure + for determining a retransmission time out is given here as an + illustration. + + An Example Retransmission Timeout Procedure + + Measure the elapsed time between sending a data octet with a + particular sequence number and receiving an acknowledgment that + covers that sequence number (segments sent do not have to match + segments received). This measured elapsed time is the Round Trip + Time (RTT). 
Next compute a Smoothed Round Trip Time (SRTT) as: + + SRTT = ( ALPHA * SRTT ) + ((1-ALPHA) * RTT) + + and based on this, compute the retransmission timeout (RTO) as: + + RTO = min[UBOUND,max[LBOUND,(BETA*SRTT)]] + + where UBOUND is an upper bound on the timeout (e.g., 1 minute), + LBOUND is a lower bound on the timeout (e.g., 1 second), ALPHA is + a smoothing factor (e.g., .8 to .9), and BETA is a delay variance + factor (e.g., 1.3 to 2.0). + + The Communication of Urgent Information + + The objective of the TCP urgent mechanism is to allow the sending user + to stimulate the receiving user to accept some urgent data and to + permit the receiving TCP to indicate to the receiving user when all + the currently known urgent data has been received by the user. + + This mechanism permits a point in the data stream to be designated as + the end of urgent information. Whenever this point is in advance of + the receive sequence number (RCV.NXT) at the receiving TCP, that TCP + must tell the user to go into "urgent mode"; when the receive sequence + number catches up to the urgent pointer, the TCP must tell user to go + + + [Page 41] + + + September 1981 +Transmission Control Protocol +Functional Specification + + + + into "normal mode". If the urgent pointer is updated while the user + is in "urgent mode", the update will be invisible to the user. + + The method employs a urgent field which is carried in all segments + transmitted. The URG control flag indicates that the urgent field is + meaningful and must be added to the segment sequence number to yield + the urgent pointer. The absence of this flag indicates that there is + no urgent data outstanding. + + To send an urgent indication the user must also send at least one data + octet. If the sending user also indicates a push, timely delivery of + the urgent information to the destination process is enhanced. 
+ + Managing the Window + + The window sent in each segment indicates the range of sequence + numbers the sender of the window (the data receiver) is currently + prepared to accept. There is an assumption that this is related to + the currently available data buffer space available for this + connection. + + Indicating a large window encourages transmissions. If more data + arrives than can be accepted, it will be discarded. This will result + in excessive retransmissions, adding unnecessarily to the load on the + network and the TCPs. Indicating a small window may restrict the + transmission of data to the point of introducing a round trip delay + between each new segment transmitted. + + The mechanisms provided allow a TCP to advertise a large window and to + subsequently advertise a much smaller window without having accepted + that much data. This, so called "shrinking the window," is strongly + discouraged. The robustness principle dictates that TCPs will not + shrink the window themselves, but will be prepared for such behavior + on the part of other TCPs. + + The sending TCP must be prepared to accept from the user and send at + least one octet of new data even if the send window is zero. The + sending TCP must regularly retransmit to the receiving TCP even when + the window is zero. Two minutes is recommended for the retransmission + interval when the window is zero. This retransmission is essential to + guarantee that when either TCP has a zero window the re-opening of the + window will be reliably reported to the other. + + When the receiving TCP has a zero window and a segment arrives it must + still send an acknowledgment showing its next expected sequence number + and current window (zero). + + The sending TCP packages the data to be transmitted into segments + + +[Page 42] + + +September 1981 + Transmission Control Protocol + Functional Specification + + + + which fit the current window, and may repackage segments on the + retransmission queue. 
Such repackaging is not required, but may be + helpful. + + In a connection with a one-way data flow, the window information will + be carried in acknowledgment segments that all have the same sequence + number so there will be no way to reorder them if they arrive out of + order. This is not a serious problem, but it will allow the window + information to be on occasion temporarily based on old reports from + the data receiver. A refinement to avoid this problem is to act on + the window information from segments that carry the highest + acknowledgment number (that is segments with acknowledgment number + equal or greater than the highest previously received). + + The window management procedure has significant influence on the + communication performance. The following comments are suggestions to + implementers. + + Window Management Suggestions + + Allocating a very small window causes data to be transmitted in + many small segments when better performance is achieved using + fewer large segments. + + One suggestion for avoiding small windows is for the receiver to + defer updating a window until the additional allocation is at + least X percent of the maximum allocation possible for the + connection (where X might be 20 to 40). + + Another suggestion is for the sender to avoid sending small + segments by waiting until the window is large enough before + sending data. If the user signals a push function then the + data must be sent even if it is a small segment. + + Note that the acknowledgments should not be delayed or unnecessary + retransmissions will result. One strategy would be to send an + acknowledgment when a small segment arrives (without updating the + window information), and then to send another acknowledgment with + new window information when the window is larger. + + The segment sent to probe a zero window may also begin a break up + of transmitted data into smaller and smaller segments.
If a + segment containing a single data octet sent to probe a zero window + is accepted, it consumes one octet of the window now available. + If the sending TCP simply sends as much as it can whenever the + window is non zero, the transmitted data will be broken into + alternating big and small segments. As time goes on, occasional + pauses in the receiver making window allocation available will + + + [Page 43] + + + September 1981 +Transmission Control Protocol +Functional Specification + + + + result in breaking the big segments into a small and not quite so + big pair. And after a while the data transmission will be in + mostly small segments. + + The suggestion here is that the TCP implementations need to + actively attempt to combine small window allocations into larger + windows, since the mechanisms for managing the window tend to lead + to many small windows in the simplest minded implementations. + +3.8. Interfaces + + There are of course two interfaces of concern: the user/TCP interface + and the TCP/lower-level interface. We have a fairly elaborate model + of the user/TCP interface, but the interface to the lower level + protocol module is left unspecified here, since it will be specified + in detail by the specification of the lower level protocol. For the + case that the lower level is IP we note some of the parameter values + that TCPs might use. + + User/TCP Interface + + The following functional description of user commands to the TCP is, + at best, fictional, since every operating system will have different + facilities. Consequently, we must warn readers that different TCP + implementations may have different user interfaces. However, all + TCPs must provide a certain minimum set of services to guarantee + that all TCP implementations can support the same protocol + hierarchy. This section specifies the functional interfaces + required of all TCP implementations.
+ + TCP User Commands + + The following sections functionally characterize a USER/TCP + interface. The notation used is similar to most procedure or + function calls in high level languages, but this usage is not + meant to rule out trap type service calls (e.g., SVCs, UUOs, + EMTs). + + The user commands described below specify the basic functions the + TCP must perform to support interprocess communication. + Individual implementations must define their own exact format, and + may provide combinations or subsets of the basic functions in + single calls. In particular, some implementations may wish to + automatically OPEN a connection on the first SEND or RECEIVE + issued by the user for a given connection. + + + + + +[Page 44] + + +September 1981 + Transmission Control Protocol + Functional Specification + + + + In providing interprocess communication facilities, the TCP must + not only accept commands, but must also return information to the + processes it serves. The latter consists of: + + (a) general information about a connection (e.g., interrupts, + remote close, binding of unspecified foreign socket). + + (b) replies to specific user commands indicating success or + various types of failure. + + Open + + Format: OPEN (local port, foreign socket, active/passive + [, timeout] [, precedence] [, security/compartment] [, options]) + -> local connection name + + We assume that the local TCP is aware of the identity of the + processes it serves and will check the authority of the process + to use the connection specified. Depending upon the + implementation of the TCP, the local network and TCP identifiers + for the source address will either be supplied by the TCP or the + lower level protocol (e.g., IP). These considerations are the + result of concern about security, to the extent that no TCP be + able to masquerade as another one, and so on. Similarly, no + process can masquerade as another without the collusion of the + TCP. 
+ + If the active/passive flag is set to passive, then this is a + call to LISTEN for an incoming connection. A passive open may + have either a fully specified foreign socket to wait for a + particular connection or an unspecified foreign socket to wait + for any call. A fully specified passive call can be made active + by the subsequent execution of a SEND. + + A transmission control block (TCB) is created and partially + filled in with data from the OPEN command parameters. + + On an active OPEN command, the TCP will begin the procedure to + synchronize (i.e., establish) the connection at once. + + The timeout, if present, permits the caller to set up a timeout + for all data submitted to TCP. If data is not successfully + delivered to the destination within the timeout period, the TCP + will abort the connection. The present global default is five + minutes. + + The TCP or some component of the operating system will verify + the user's authority to open a connection with the specified + + + [Page 45] + + + September 1981 +Transmission Control Protocol +Functional Specification + + + + precedence or security/compartment. The absence of precedence + or security/compartment specification in the OPEN call indicates + the default values must be used. + + TCP will accept incoming requests as matching only if the + security/compartment information is exactly the same and only if + the precedence is equal to or higher than the precedence + requested in the OPEN call. + + The precedence for the connection is the higher of the values + requested in the OPEN call and received from the incoming + request, and fixed at that value for the life of the + connection. Implementers may want to give the user control of + this precedence negotiation. For example, the user might be + allowed to specify that the precedence must be exactly matched, + or that any attempt to raise the precedence be confirmed by the + user.
+ + A local connection name will be returned to the user by the TCP. + The local connection name can then be used as a short hand term + for the connection defined by the <local socket, foreign socket> + pair. + + Send + + Format: SEND (local connection name, buffer address, byte + count, PUSH flag, URGENT flag [,timeout]) + + This call causes the data contained in the indicated user buffer + to be sent on the indicated connection. If the connection has + not been opened, the SEND is considered an error. Some + implementations may allow users to SEND first; in which case, an + automatic OPEN would be done. If the calling process is not + authorized to use this connection, an error is returned. + + If the PUSH flag is set, the data must be transmitted promptly + to the receiver, and the PUSH bit will be set in the last TCP + segment created from the buffer. If the PUSH flag is not set, + the data may be combined with data from subsequent SENDs for + transmission efficiency. + + If the URGENT flag is set, segments sent to the destination TCP + will have the urgent pointer set. The receiving TCP will signal + the urgent condition to the receiving process if the urgent + pointer indicates that data preceding the urgent pointer has not + been consumed by the receiving process. The purpose of urgent + is to stimulate the receiver to process the urgent data and to + indicate to the receiver when all the currently known urgent + + +[Page 46] + + +September 1981 + Transmission Control Protocol + Functional Specification + + + + data has been received. The number of times the sending user's + TCP signals urgent will not necessarily be equal to the number + of times the receiving user will be notified of the presence of + urgent data. + + If no foreign socket was specified in the OPEN, but the + connection is established (e.g., because a LISTENing connection + has become specific due to a foreign segment arriving for the + local socket), then the designated buffer is sent to the implied + foreign socket.
Users who make use of OPEN with an unspecified + foreign socket can make use of SEND without ever explicitly + knowing the foreign socket address. + + However, if a SEND is attempted before the foreign socket + becomes specified, an error will be returned. Users can use the + STATUS call to determine the status of the connection. In some + implementations the TCP may notify the user when an unspecified + socket is bound. + + If a timeout is specified, the current user timeout for this + connection is changed to the new one. + + In the simplest implementation, SEND would not return control to + the sending process until either the transmission was complete + or the timeout had been exceeded. However, this simple method + is both subject to deadlocks (for example, both sides of the + connection might try to do SENDs before doing any RECEIVEs) and + offers poor performance, so it is not recommended. A more + sophisticated implementation would return immediately to allow + the process to run concurrently with network I/O, and, + furthermore, to allow multiple SENDs to be in progress. + Multiple SENDs are served in first come, first served order, so + the TCP will queue those it cannot service immediately. + + We have implicitly assumed an asynchronous user interface in + which a SEND later elicits some kind of SIGNAL or + pseudo-interrupt from the serving TCP. An alternative is to + return a response immediately. For instance, SENDs might return + immediate local acknowledgment, even if the segment sent had not + been acknowledged by the distant TCP. We could optimistically + assume eventual success. If we are wrong, the connection will + close anyway due to the timeout. In implementations of this + kind (synchronous), there will still be some asynchronous + signals, but these will deal with the connection itself, and not + with specific segments or buffers. 
+ + In order for the process to distinguish among error or success + indications for different SENDs, it might be appropriate for the + + + [Page 47] + + + September 1981 +Transmission Control Protocol +Functional Specification + + + + buffer address to be returned along with the coded response to + the SEND request. TCP-to-user signals are discussed below, + indicating the information which should be returned to the + calling process. + + Receive + + Format: RECEIVE (local connection name, buffer address, byte + count) -> byte count, urgent flag, push flag + + This command allocates a receiving buffer associated with the + specified connection. If no OPEN precedes this command or the + calling process is not authorized to use this connection, an + error is returned. + + In the simplest implementation, control would not return to the + calling program until either the buffer was filled, or some + error occurred, but this scheme is highly subject to deadlocks. + A more sophisticated implementation would permit several + RECEIVEs to be outstanding at once. These would be filled as + segments arrive. This strategy permits increased throughput at + the cost of a more elaborate scheme (possibly asynchronous) to + notify the calling program that a PUSH has been seen or a buffer + filled. + + If enough data arrive to fill the buffer before a PUSH is seen, + the PUSH flag will not be set in the response to the RECEIVE. + The buffer will be filled with as much data as it can hold. If + a PUSH is seen before the buffer is filled the buffer will be + returned partially filled and PUSH indicated. + + If there is urgent data the user will have been informed as soon + as it arrived via a TCP-to-user signal. The receiving user + should thus be in "urgent mode". If the URGENT flag is on, + additional urgent data remains. If the URGENT flag is off, this + call to RECEIVE has returned all the urgent data, and the user + may now leave "urgent mode". 
Note that data following the + urgent pointer (non-urgent data) cannot be delivered to the user + in the same buffer with preceding urgent data unless the + boundary is clearly marked for the user. + + To distinguish among several outstanding RECEIVEs and to take + care of the case that a buffer is not completely filled, the + return code is accompanied by both a buffer pointer and a byte + count indicating the actual length of the data received. + + Alternative implementations of RECEIVE might have the TCP + + + +[Page 48] + + +September 1981 + Transmission Control Protocol + Functional Specification + + + + allocate buffer storage, or the TCP might share a ring buffer + with the user. + + Close + + Format: CLOSE (local connection name) + + This command causes the connection specified to be closed. If + the connection is not open or the calling process is not + authorized to use this connection, an error is returned. + Closing connections is intended to be a graceful operation in + the sense that outstanding SENDs will be transmitted (and + retransmitted), as flow control permits, until all have been + serviced. Thus, it should be acceptable to make several SEND + calls, followed by a CLOSE, and expect all the data to be sent + to the destination. It should also be clear that users should + continue to RECEIVE on CLOSING connections, since the other side + may be trying to transmit the last of its data. Thus, CLOSE + means "I have no more to send" but does not mean "I will not + receive any more." It may happen (if the user level protocol is + not well thought out) that the closing side is unable to get rid + of all its data before timing out. In this event, CLOSE turns + into ABORT, and the closing TCP gives up. + + The user may CLOSE the connection at any time on his own + initiative, or in response to various prompts from the TCP + (e.g., remote close executed, transmission timeout exceeded, + destination inaccessible).
+ + Because closing a connection requires communication with the + foreign TCP, connections may remain in the closing state for a + short time. Attempts to reopen the connection before the TCP + replies to the CLOSE command will result in error responses. + + Close also implies push function. + + Status + + Format: STATUS (local connection name) -> status data + + This is an implementation dependent user command and could be + excluded without adverse effect. Information returned would + typically come from the TCB associated with the connection. + + This command returns a data block containing the following + information: + + local socket, + + + [Page 49] + + + September 1981 +Transmission Control Protocol +Functional Specification + + + + foreign socket, + local connection name, + receive window, + send window, + connection state, + number of buffers awaiting acknowledgment, + number of buffers pending receipt, + urgent state, + precedence, + security/compartment, + and transmission timeout. + + Depending on the state of the connection, or on the + implementation itself, some of this information may not be + available or meaningful. If the calling process is not + authorized to use this connection, an error is returned. This + prevents unauthorized processes from gaining information about a + connection. + + Abort + + Format: ABORT (local connection name) + + This command causes all pending SENDs and RECEIVES to be + aborted, the TCB to be removed, and a special RESET message to + be sent to the TCP on the other side of the connection. + Depending on the implementation, users may receive abort + indications for each outstanding SEND or RECEIVE, or may simply + receive an ABORT-acknowledgment. + + TCP-to-User Messages + + It is assumed that the operating system environment provides a + means for the TCP to asynchronously signal the user program. When + the TCP does signal a user program, certain information is passed + to the user. 
Often in the specification the information will be + an error message. In other cases there will be information + relating to the completion of processing a SEND or RECEIVE or + other user call. + + The following information is provided: + + Local Connection Name Always + Response String Always + Buffer Address Send & Receive + Byte count (counts bytes received) Receive + Push flag Receive + Urgent flag Receive + + +[Page 50] + + +September 1981 + Transmission Control Protocol + Functional Specification + + + + TCP/Lower-Level Interface + + The TCP calls on a lower level protocol module to actually send and + receive information over a network. One case is that of the ARPA + internetwork system where the lower level module is the Internet + Protocol (IP) [2]. + + If the lower level protocol is IP it provides arguments for a type + of service and for a time to live. TCP uses the following settings + for these parameters: + + Type of Service = Precedence: routine, Delay: normal, Throughput: + normal, Reliability: normal; or 00000000. + + Time to Live = one minute, or 00111100. + + Note that the assumed maximum segment lifetime is two minutes. + Here we explicitly ask that a segment be destroyed if it cannot + be delivered by the internet system within one minute. + + If the lower level is IP (or other protocol that provides this + feature) and source routing is used, the interface must allow the + route information to be communicated. This is especially important + so that the source and destination addresses used in the TCP + checksum be the originating source and ultimate destination. It is + also important to preserve the return route to answer connection + requests. + + Any lower level protocol will have to provide the source address, + destination address, and protocol fields, and some way to determine + the "TCP length", both to provide the functional equivalent service + of IP and to be used in the TCP checksum.
+ + + + + + + + + + + + + + + + + + + [Page 51] + + + September 1981 +Transmission Control Protocol +Functional Specification + + + +3.9. Event Processing + + The processing depicted in this section is an example of one possible + implementation. Other implementations may have slightly different + processing sequences, but they should differ from those in this + section only in detail, not in substance. + + The activity of the TCP can be characterized as responding to events. + The events that occur can be cast into three categories: user calls, + arriving segments, and timeouts. This section describes the + processing the TCP does in response to each of the events. In many + cases the processing required depends on the state of the connection. + + Events that occur: + + User Calls + + OPEN + SEND + RECEIVE + CLOSE + ABORT + STATUS + + Arriving Segments + + SEGMENT ARRIVES + + Timeouts + + USER TIMEOUT + RETRANSMISSION TIMEOUT + TIME-WAIT TIMEOUT + + The model of the TCP/user interface is that user commands receive an + immediate return and possibly a delayed response via an event or + pseudo interrupt. In the following descriptions, the term "signal" + means cause a delayed response. + + Error responses are given as character strings. For example, user + commands referencing connections that do not exist receive "error: + connection not open". + + Please note in the following that all arithmetic on sequence numbers, + acknowledgment numbers, windows, et cetera, is modulo 2**32 the size + of the sequence number space. Also note that "=<" means less than or + equal to (modulo 2**32). 
+ + + +[Page 52] + + +September 1981 + Transmission Control Protocol + Functional Specification + + + + A natural way to think about processing incoming segments is to + imagine that they are first tested for proper sequence number (i.e., + that their contents lie in the range of the expected "receive window" + in the sequence number space) and then that they are generally queued + and processed in sequence number order. + + When a segment overlaps other already received segments we reconstruct + the segment to contain just the new data, and adjust the header fields + to be consistent. + + Note that if no state change is mentioned the TCP stays in the same + state. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + [Page 53] + + + September 1981 +Transmission Control Protocol +Functional Specification + OPEN Call + + + + OPEN Call + + CLOSED STATE (i.e., TCB does not exist) + + Create a new transmission control block (TCB) to hold connection + state information. Fill in local socket identifier, foreign + socket, precedence, security/compartment, and user timeout + information. Note that some parts of the foreign socket may be + unspecified in a passive OPEN and are to be filled in by the + parameters of the incoming SYN segment. Verify the security and + precedence requested are allowed for this user, if not return + "error: precedence not allowed" or "error: security/compartment + not allowed." If passive enter the LISTEN state and return. If + active and the foreign socket is unspecified, return "error: + foreign socket unspecified"; if active and the foreign socket is + specified, issue a SYN segment. An initial send sequence number + (ISS) is selected. A SYN segment of the form <SEQ=ISS><CTL=SYN> + is sent. Set SND.UNA to ISS, SND.NXT to ISS+1, enter SYN-SENT + state, and return. + + If the caller does not have access to the local socket specified, + return "error: connection illegal for this process".
If there is + no room to create a new connection, return "error: insufficient + resources". + + LISTEN STATE + + If active and the foreign socket is specified, then change the + connection from passive to active, select an ISS. Send a SYN + segment, set SND.UNA to ISS, SND.NXT to ISS+1. Enter SYN-SENT + state. Data associated with SEND may be sent with SYN segment or + queued for transmission after entering ESTABLISHED state. The + urgent bit if requested in the command must be sent with the data + segments sent as a result of this command. If there is no room to + queue the request, respond with "error: insufficient resources". + If Foreign socket was not specified, then return "error: foreign + socket unspecified". + + + + + + + + + + + + +[Page 54] + + +September 1981 + Transmission Control Protocol + Functional Specification +OPEN Call + + + + SYN-SENT STATE + SYN-RECEIVED STATE + ESTABLISHED STATE + FIN-WAIT-1 STATE + FIN-WAIT-2 STATE + CLOSE-WAIT STATE + CLOSING STATE + LAST-ACK STATE + TIME-WAIT STATE + + Return "error: connection already exists". + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + [Page 55] + + + September 1981 +Transmission Control Protocol +Functional Specification + SEND Call + + + + SEND Call + + CLOSED STATE (i.e., TCB does not exist) + + If the user does not have access to such a connection, then return + "error: connection illegal for this process". + + Otherwise, return "error: connection does not exist". + + LISTEN STATE + + If the foreign socket is specified, then change the connection + from passive to active, select an ISS. Send a SYN segment, set + SND.UNA to ISS, SND.NXT to ISS+1. Enter SYN-SENT state. Data + associated with SEND may be sent with SYN segment or queued for + transmission after entering ESTABLISHED state. The urgent bit if + requested in the command must be sent with the data segments sent + as a result of this command. 
If there is no room to queue the + request, respond with "error: insufficient resources". If + Foreign socket was not specified, then return "error: foreign + socket unspecified". + + SYN-SENT STATE + SYN-RECEIVED STATE + + Queue the data for transmission after entering ESTABLISHED state. + If no space to queue, respond with "error: insufficient + resources". + + ESTABLISHED STATE + CLOSE-WAIT STATE + + Segmentize the buffer and send it with a piggybacked + acknowledgment (acknowledgment value = RCV.NXT). If there is + insufficient space to remember this buffer, simply return "error: + insufficient resources". + + If the urgent flag is set, then SND.UP <- SND.NXT-1 and set the + urgent pointer in the outgoing segments. + + + + + + + + + + +[Page 56] + + +September 1981 + Transmission Control Protocol + Functional Specification +SEND Call + + + + FIN-WAIT-1 STATE + FIN-WAIT-2 STATE + CLOSING STATE + LAST-ACK STATE + TIME-WAIT STATE + + Return "error: connection closing" and do not service request. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + [Page 57] + + + September 1981 +Transmission Control Protocol +Functional Specification + RECEIVE Call + + + + RECEIVE Call + + CLOSED STATE (i.e., TCB does not exist) + + If the user does not have access to such a connection, return + "error: connection illegal for this process". + + Otherwise return "error: connection does not exist". + + LISTEN STATE + SYN-SENT STATE + SYN-RECEIVED STATE + + Queue for processing after entering ESTABLISHED state. If there + is no room to queue this request, respond with "error: + insufficient resources". + + ESTABLISHED STATE + FIN-WAIT-1 STATE + FIN-WAIT-2 STATE + + If insufficient incoming segments are queued to satisfy the + request, queue the request. If there is no queue space to + remember the RECEIVE, respond with "error: insufficient + resources". + + Reassemble queued incoming segments into receive buffer and return + to user. 
Mark "push seen" (PUSH) if this is the case. + + If RCV.UP is in advance of the data currently being passed to the + user notify the user of the presence of urgent data. + + When the TCP takes responsibility for delivering data to the user + that fact must be communicated to the sender via an + acknowledgment. The formation of such an acknowledgment is + described below in the discussion of processing an incoming + segment. + + + + + + + + + + + + +[Page 58] + + +September 1981 + Transmission Control Protocol + Functional Specification +RECEIVE Call + + + + CLOSE-WAIT STATE + + Since the remote side has already sent FIN, RECEIVEs must be + satisfied by text already on hand, but not yet delivered to the + user. If no text is awaiting delivery, the RECEIVE will get a + "error: connection closing" response. Otherwise, any remaining + text can be used to satisfy the RECEIVE. + + CLOSING STATE + LAST-ACK STATE + TIME-WAIT STATE + + Return "error: connection closing". + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + [Page 59] + + + September 1981 +Transmission Control Protocol +Functional Specification + CLOSE Call + + + + CLOSE Call + + CLOSED STATE (i.e., TCB does not exist) + + If the user does not have access to such a connection, return + "error: connection illegal for this process". + + Otherwise, return "error: connection does not exist". + + LISTEN STATE + + Any outstanding RECEIVEs are returned with "error: closing" + responses. Delete TCB, enter CLOSED state, and return. + + SYN-SENT STATE + + Delete the TCB and return "error: closing" responses to any + queued SENDs, or RECEIVEs. + + SYN-RECEIVED STATE + + If no SENDs have been issued and there is no pending data to send, + then form a FIN segment and send it, and enter FIN-WAIT-1 state; + otherwise queue for processing after entering ESTABLISHED state. + + ESTABLISHED STATE + + Queue this until all preceding SENDs have been segmentized, then + form a FIN segment and send it. 
In any case, enter FIN-WAIT-1 + state. + + FIN-WAIT-1 STATE + FIN-WAIT-2 STATE + + Strictly speaking, this is an error and should receive a "error: + connection closing" response. An "ok" response would be + acceptable, too, as long as a second FIN is not emitted (the first + FIN may be retransmitted though). + + + + + + + + + + + +[Page 60] + + +September 1981 + Transmission Control Protocol + Functional Specification +CLOSE Call + + + + CLOSE-WAIT STATE + + Queue this request until all preceding SENDs have been + segmentized; then send a FIN segment, enter CLOSING state. + + CLOSING STATE + LAST-ACK STATE + TIME-WAIT STATE + + Respond with "error: connection closing". + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + [Page 61] + + + September 1981 +Transmission Control Protocol +Functional Specification + ABORT Call + + + + ABORT Call + + CLOSED STATE (i.e., TCB does not exist) + + If the user should not have access to such a connection, return + "error: connection illegal for this process". + + Otherwise return "error: connection does not exist". + + LISTEN STATE + + Any outstanding RECEIVEs should be returned with "error: + connection reset" responses. Delete TCB, enter CLOSED state, and + return. + + SYN-SENT STATE + + All queued SENDs and RECEIVEs should be given "connection reset" + notification, delete the TCB, enter CLOSED state, and return. + + SYN-RECEIVED STATE + ESTABLISHED STATE + FIN-WAIT-1 STATE + FIN-WAIT-2 STATE + CLOSE-WAIT STATE + + Send a reset segment: + + <SEQ=SND.NXT><CTL=RST> + + All queued SENDs and RECEIVEs should be given "connection reset" + notification; all segments queued for transmission (except for the + RST formed above) or retransmission should be flushed, delete the + TCB, enter CLOSED state, and return. + + CLOSING STATE + LAST-ACK STATE + TIME-WAIT STATE + + Respond with "ok" and delete the TCB, enter CLOSED state, and + return.
+ + + + + + + + +[Page 62] + + +September 1981 + Transmission Control Protocol + Functional Specification +STATUS Call + + + + STATUS Call + + CLOSED STATE (i.e., TCB does not exist) + + If the user should not have access to such a connection, return + "error: connection illegal for this process". + + Otherwise return "error: connection does not exist". + + LISTEN STATE + + Return "state = LISTEN", and the TCB pointer. + + SYN-SENT STATE + + Return "state = SYN-SENT", and the TCB pointer. + + SYN-RECEIVED STATE + + Return "state = SYN-RECEIVED", and the TCB pointer. + + ESTABLISHED STATE + + Return "state = ESTABLISHED", and the TCB pointer. + + FIN-WAIT-1 STATE + + Return "state = FIN-WAIT-1", and the TCB pointer. + + FIN-WAIT-2 STATE + + Return "state = FIN-WAIT-2", and the TCB pointer. + + CLOSE-WAIT STATE + + Return "state = CLOSE-WAIT", and the TCB pointer. + + CLOSING STATE + + Return "state = CLOSING", and the TCB pointer. + + LAST-ACK STATE + + Return "state = LAST-ACK", and the TCB pointer. + + + + + + [Page 63] + + + September 1981 +Transmission Control Protocol +Functional Specification + STATUS Call + + + + TIME-WAIT STATE + + Return "state = TIME-WAIT", and the TCB pointer. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +[Page 64] + + +September 1981 + Transmission Control Protocol + Functional Specification +SEGMENT ARRIVES + + + + SEGMENT ARRIVES + + If the state is CLOSED (i.e., TCB does not exist) then + + all data in the incoming segment is discarded. An incoming + segment containing a RST is discarded. An incoming segment not + containing a RST causes a RST to be sent in response. The + acknowledgment and sequence field values are selected to make the + reset sequence acceptable to the TCP that sent the offending + segment. + + If the ACK bit is off, sequence number zero is used, + + <SEQ=0><ACK=SEG.SEQ+SEG.LEN><CTL=RST,ACK> + + If the ACK bit is on, + + <SEQ=SEG.ACK><CTL=RST> + + Return.
+ + If the state is LISTEN then + + first check for an RST + + An incoming RST should be ignored. Return. + + second check for an ACK + + Any acknowledgment is bad if it arrives on a connection still in + the LISTEN state. An acceptable reset segment should be formed + for any arriving ACK-bearing segment. The RST should be + formatted as follows: + + <SEQ=SEG.ACK><CTL=RST> + + Return. + + third check for a SYN + + If the SYN bit is set, check the security. If the + security/compartment on the incoming segment does not exactly + match the security/compartment in the TCB then send a reset and + return. + + <SEQ=SEG.ACK><CTL=RST> + + + + [Page 65] + + + September 1981 +Transmission Control Protocol +Functional Specification + SEGMENT ARRIVES + + + + If the SEG.PRC is greater than the TCB.PRC then if allowed by + the user and the system set TCB.PRC<-SEG.PRC, if not allowed + send a reset and return. + + <SEQ=SEG.ACK><CTL=RST> + + If the SEG.PRC is less than the TCB.PRC then continue. + + Set RCV.NXT to SEG.SEQ+1, IRS is set to SEG.SEQ and any other + control or text should be queued for processing later. ISS + should be selected and a SYN segment sent of the form: + + <SEQ=ISS><ACK=RCV.NXT><CTL=SYN,ACK> + + SND.NXT is set to ISS+1 and SND.UNA to ISS. The connection + state should be changed to SYN-RECEIVED. Note that any other + incoming control or data (combined with SYN) will be processed + in the SYN-RECEIVED state, but processing of SYN and ACK should + not be repeated. If the listen was not fully specified (i.e., + the foreign socket was not fully specified), then the + unspecified fields should be filled in now. + + fourth other text or control + + Any other control or text-bearing segment (not containing SYN) + must have an ACK and thus would be discarded by the ACK + processing. An incoming RST segment could not be valid, since + it could not have been sent in response to anything sent by this + incarnation of the connection. So you are unlikely to get here, + but if you do, drop the segment, and return.
+ + If the state is SYN-SENT then + + first check the ACK bit + + If the ACK bit is set + + If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send a reset (unless + the RST bit is set, if so drop the segment and return) + + <SEQ=SEG.ACK><CTL=RST> + + and discard the segment. Return. + + If SND.UNA =< SEG.ACK =< SND.NXT then the ACK is acceptable. + + second check the RST bit + + +[Page 66] + + +September 1981 + Transmission Control Protocol + Functional Specification +SEGMENT ARRIVES + + + + If the RST bit is set + + If the ACK was acceptable then signal the user "error: + connection reset", drop the segment, enter CLOSED state, + delete TCB, and return. Otherwise (no ACK) drop the segment + and return. + + third check the security and precedence + + If the security/compartment in the segment does not exactly + match the security/compartment in the TCB, send a reset + + If there is an ACK + + <SEQ=SEG.ACK><CTL=RST> + + Otherwise + + <SEQ=0><ACK=SEG.SEQ+SEG.LEN><CTL=RST,ACK> + + If there is an ACK + + The precedence in the segment must match the precedence in the + TCB, if not, send a reset + + <SEQ=SEG.ACK><CTL=RST> + + If there is no ACK + + If the precedence in the segment is higher than the precedence + in the TCB then if allowed by the user and the system raise + the precedence in the TCB to that in the segment, if not + allowed to raise the prec then send a reset. + + <SEQ=0><ACK=SEG.SEQ+SEG.LEN><CTL=RST,ACK> + + If the precedence in the segment is lower than the precedence + in the TCB continue. + + If a reset was sent, discard the segment and return. + + fourth check the SYN bit + + This step should be reached only if the ACK is ok, or there is + no ACK, and the segment did not contain a RST. + + If the SYN bit is on and the security/compartment and precedence + + + [Page 67] + + + September 1981 +Transmission Control Protocol +Functional Specification + SEGMENT ARRIVES + + + + are acceptable then, RCV.NXT is set to SEG.SEQ+1, IRS is set to + SEG.SEQ. SND.UNA should be advanced to equal SEG.ACK (if there + is an ACK), and any segments on the retransmission queue which + are thereby acknowledged should be removed.
+ + If SND.UNA > ISS (our SYN has been ACKed), change the connection + state to ESTABLISHED, form an ACK segment + + <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK> + + and send it. Data or controls which were queued for + transmission may be included. If there are other controls or + text in the segment then continue processing at the sixth step + below where the URG bit is checked, otherwise return. + + Otherwise enter SYN-RECEIVED, form a SYN,ACK segment + + <SEQ=ISS><ACK=RCV.NXT><CTL=SYN,ACK> + + and send it. If there are other controls or text in the + segment, queue them for processing after the ESTABLISHED state + has been reached, return. + + fifth, if neither of the SYN or RST bits is set then drop the + segment and return. + + + + + + + + + + + + + + + + + + + + + + + + +[Page 68] + + +September 1981 + Transmission Control Protocol + Functional Specification +SEGMENT ARRIVES + + + + Otherwise, + + first check sequence number + + SYN-RECEIVED STATE + ESTABLISHED STATE + FIN-WAIT-1 STATE + FIN-WAIT-2 STATE + CLOSE-WAIT STATE + CLOSING STATE + LAST-ACK STATE + TIME-WAIT STATE + + Segments are processed in sequence. Initial tests on arrival + are used to discard old duplicates, but further processing is + done in SEG.SEQ order. If a segment's contents straddle the + boundary between old and new, only the new parts should be + processed. + + There are four cases for the acceptability test for an incoming + segment: + + Segment Receive Test + Length Window + ------- ------- ------------------------------------------- + + 0 0 SEG.SEQ = RCV.NXT + + 0 >0 RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND + + >0 0 not acceptable + + >0 >0 RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND + or RCV.NXT =< SEG.SEQ+SEG.LEN-1 < RCV.NXT+RCV.WND + + If the RCV.WND is zero, no segments will be acceptable, but + special allowance should be made to accept valid ACKs, URGs and + RSTs.
+ + If an incoming segment is not acceptable, an acknowledgment + should be sent in reply (unless the RST bit is set, if so drop + the segment and return): + + <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK> + + After sending the acknowledgment, drop the unacceptable segment + and return. + + + [Page 69] + + + September 1981 +Transmission Control Protocol +Functional Specification + SEGMENT ARRIVES + + + + In the following it is assumed that the segment is the idealized + segment that begins at RCV.NXT and does not exceed the window. + One could tailor actual segments to fit this assumption by + trimming off any portions that lie outside the window (including + SYN and FIN), and only processing further if the segment then + begins at RCV.NXT. Segments with higher beginning sequence + numbers may be held for later processing. + + second check the RST bit, + + SYN-RECEIVED STATE + + If the RST bit is set + + If this connection was initiated with a passive OPEN (i.e., + came from the LISTEN state), then return this connection to + LISTEN state and return. The user need not be informed. If + this connection was initiated with an active OPEN (i.e., came + from SYN-SENT state) then the connection was refused, signal + the user "connection refused". In either case, all segments + on the retransmission queue should be removed. And in the + active OPEN case, enter the CLOSED state and delete the TCB, + and return. + + ESTABLISHED + FIN-WAIT-1 + FIN-WAIT-2 + CLOSE-WAIT + + If the RST bit is set then, any outstanding RECEIVEs and SEND + should receive "reset" responses. All segment queues should be + flushed. Users should also receive an unsolicited general + "connection reset" signal. Enter the CLOSED state, delete the + TCB, and return. + + CLOSING STATE + LAST-ACK STATE + TIME-WAIT + + If the RST bit is set then, enter the CLOSED state, delete the + TCB, and return.
+ + + + + + + + +[Page 70] + + +September 1981 + Transmission Control Protocol + Functional Specification +SEGMENT ARRIVES + + + + third check security and precedence + + SYN-RECEIVED + + If the security/compartment and precedence in the segment do not + exactly match the security/compartment and precedence in the TCB + then send a reset, and return. + + ESTABLISHED STATE + + If the security/compartment and precedence in the segment do not + exactly match the security/compartment and precedence in the TCB + then send a reset, any outstanding RECEIVEs and SEND should + receive "reset" responses. All segment queues should be + flushed. Users should also receive an unsolicited general + "connection reset" signal. Enter the CLOSED state, delete the + TCB, and return. + + Note this check is placed following the sequence check to prevent + a segment from an old connection between these ports with a + different security or precedence from causing an abort of the + current connection. + + fourth, check the SYN bit, + + SYN-RECEIVED + ESTABLISHED STATE + FIN-WAIT-1 STATE + FIN-WAIT-2 STATE + CLOSE-WAIT STATE + CLOSING STATE + LAST-ACK STATE + TIME-WAIT STATE + + If the SYN is in the window it is an error, send a reset, any + outstanding RECEIVEs and SEND should receive "reset" responses, + all segment queues should be flushed, the user should also + receive an unsolicited general "connection reset" signal, enter + the CLOSED state, delete the TCB, and return. + + If the SYN is not in the window this step would not be reached + and an ack would have been sent in the first step (sequence + number check). + + + + + + + [Page 71] + + + September 1981 +Transmission Control Protocol +Functional Specification + SEGMENT ARRIVES + + + + fifth check the ACK field, + + if the ACK bit is off drop the segment and return + + if the ACK bit is on + + SYN-RECEIVED STATE + + If SND.UNA =< SEG.ACK =< SND.NXT then enter ESTABLISHED state + and continue processing.
+ + If the segment acknowledgment is not acceptable, form a + reset segment, + + <SEQ=SEG.ACK><CTL=RST> + + and send it. + + ESTABLISHED STATE + + If SND.UNA < SEG.ACK =< SND.NXT then, set SND.UNA <- SEG.ACK. + Any segments on the retransmission queue which are thereby + entirely acknowledged are removed. Users should receive + positive acknowledgments for buffers which have been SENT and + fully acknowledged (i.e., SEND buffer should be returned with + "ok" response). If the ACK is a duplicate + (SEG.ACK < SND.UNA), it can be ignored. If the ACK acks + something not yet sent (SEG.ACK > SND.NXT) then send an ACK, + drop the segment, and return. + + If SND.UNA < SEG.ACK =< SND.NXT, the send window should be + updated. If (SND.WL1 < SEG.SEQ or (SND.WL1 = SEG.SEQ and + SND.WL2 =< SEG.ACK)), set SND.WND <- SEG.WND, set + SND.WL1 <- SEG.SEQ, and set SND.WL2 <- SEG.ACK. + + Note that SND.WND is an offset from SND.UNA, that SND.WL1 + records the sequence number of the last segment used to update + SND.WND, and that SND.WL2 records the acknowledgment number of + the last segment used to update SND.WND. The check here + prevents using old segments to update the window. + + + + + + + + + +[Page 72] + + +September 1981 + Transmission Control Protocol + Functional Specification +SEGMENT ARRIVES + + + + FIN-WAIT-1 STATE + + In addition to the processing for the ESTABLISHED state, if + our FIN is now acknowledged then enter FIN-WAIT-2 and continue + processing in that state. + + FIN-WAIT-2 STATE + + In addition to the processing for the ESTABLISHED state, if + the retransmission queue is empty, the user's CLOSE can be + acknowledged ("ok") but do not delete the TCB. + + CLOSE-WAIT STATE + + Do the same processing as for the ESTABLISHED state. + + CLOSING STATE + + In addition to the processing for the ESTABLISHED state, if + the ACK acknowledges our FIN then enter the TIME-WAIT state, + otherwise ignore the segment.
+ + LAST-ACK STATE + + The only thing that can arrive in this state is an + acknowledgment of our FIN. If our FIN is now acknowledged, + delete the TCB, enter the CLOSED state, and return. + + TIME-WAIT STATE + + The only thing that can arrive in this state is a + retransmission of the remote FIN. Acknowledge it, and restart + the 2 MSL timeout. + + sixth, check the URG bit, + + ESTABLISHED STATE + FIN-WAIT-1 STATE + FIN-WAIT-2 STATE + + If the URG bit is set, RCV.UP <- max(RCV.UP,SEG.UP), and signal + the user that the remote side has urgent data if the urgent + pointer (RCV.UP) is in advance of the data consumed. If the + user has already been signaled (or is still in the "urgent + mode") for this continuous sequence of urgent data, do not + signal the user again. + + + + [Page 73] + + + September 1981 +Transmission Control Protocol +Functional Specification + SEGMENT ARRIVES + + + + CLOSE-WAIT STATE + CLOSING STATE + LAST-ACK STATE + TIME-WAIT + + This should not occur, since a FIN has been received from the + remote side. Ignore the URG. + + seventh, process the segment text, + + ESTABLISHED STATE + FIN-WAIT-1 STATE + FIN-WAIT-2 STATE + + Once in the ESTABLISHED state, it is possible to deliver segment + text to user RECEIVE buffers. Text from segments can be moved + into buffers until either the buffer is full or the segment is + empty. If the segment empties and carries a PUSH flag, then + the user is informed, when the buffer is returned, that a PUSH + has been received. + + When the TCP takes responsibility for delivering the data to the + user it must also acknowledge the receipt of the data. + + Once the TCP takes responsibility for the data it advances + RCV.NXT over the data accepted, and adjusts RCV.WND as + appropriate to the current buffer availability. The total of + RCV.NXT and RCV.WND should not be reduced. + + Please note the window management suggestions in section 3.7.
+ + Send an acknowledgment of the form: + + <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK> + + This acknowledgment should be piggybacked on a segment being + transmitted if possible without incurring undue delay. + + + + + + + + + + + + +[Page 74] + + +September 1981 + Transmission Control Protocol + Functional Specification +SEGMENT ARRIVES + + + + CLOSE-WAIT STATE + CLOSING STATE + LAST-ACK STATE + TIME-WAIT STATE + + This should not occur, since a FIN has been received from the + remote side. Ignore the segment text. + + eighth, check the FIN bit, + + Do not process the FIN if the state is CLOSED, LISTEN or SYN-SENT + since the SEG.SEQ cannot be validated; drop the segment and + return. + + If the FIN bit is set, signal the user "connection closing" and + return any pending RECEIVEs with same message, advance RCV.NXT + over the FIN, and send an acknowledgment for the FIN. Note that + FIN implies PUSH for any segment text not yet delivered to the + user. + + SYN-RECEIVED STATE + ESTABLISHED STATE + + Enter the CLOSE-WAIT state. + + FIN-WAIT-1 STATE + + If our FIN has been ACKed (perhaps in this segment), then + enter TIME-WAIT, start the time-wait timer, turn off the other + timers; otherwise enter the CLOSING state. + + FIN-WAIT-2 STATE + + Enter the TIME-WAIT state. Start the time-wait timer, turn + off the other timers. + + CLOSE-WAIT STATE + + Remain in the CLOSE-WAIT state. + + CLOSING STATE + + Remain in the CLOSING state. + + LAST-ACK STATE + + Remain in the LAST-ACK state. + + + [Page 75] + + + September 1981 +Transmission Control Protocol +Functional Specification + SEGMENT ARRIVES + + + + TIME-WAIT STATE + + Remain in the TIME-WAIT state. Restart the 2 MSL time-wait + timeout. + + and return.
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +[Page 76] + + +September 1981 + Transmission Control Protocol + Functional Specification +USER TIMEOUT + + + + USER TIMEOUT + + For any state if the user timeout expires, flush all queues, signal + the user "error: connection aborted due to user timeout" in general + and for any outstanding calls, delete the TCB, enter the CLOSED + state and return. + + RETRANSMISSION TIMEOUT + + For any state if the retransmission timeout expires on a segment in + the retransmission queue, send the segment at the front of the + retransmission queue again, reinitialize the retransmission timer, + and return. + + TIME-WAIT TIMEOUT + + If the time-wait timeout expires on a connection delete the TCB, + enter the CLOSED state and return. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + [Page 77] + + + September 1981 +Transmission Control Protocol + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +[Page 78] + + +September 1981 + Transmission Control Protocol + + + + GLOSSARY + + + +1822 + BBN Report 1822, "The Specification of the Interconnection of + a Host and an IMP". The specification of interface between a + host and the ARPANET. + +ACK + A control bit (acknowledge) occupying no sequence space, which + indicates that the acknowledgment field of this segment + specifies the next sequence number the sender of this segment + is expecting to receive, hence acknowledging receipt of all + previous sequence numbers. + +ARPANET message + The unit of transmission between a host and an IMP in the + ARPANET. The maximum size is about 1012 octets (8096 bits). + +ARPANET packet + A unit of transmission used internally in the ARPANET between + IMPs. The maximum size is about 126 octets (1008 bits). + +connection + A logical communication path identified by a pair of sockets. 
+ +datagram + A message sent in a packet switched computer communications + network. + +Destination Address + The destination address, usually the network and host + identifiers. + +FIN + A control bit (finis) occupying one sequence number, which + indicates that the sender will send no more data or control + occupying sequence space. + +fragment + A portion of a logical unit of data, in particular an internet + fragment is a portion of an internet datagram. + +FTP + A file transfer protocol. + + + + + + [Page 79] + + + September 1981 +Transmission Control Protocol +Glossary + + + +header + Control information at the beginning of a message, segment, + fragment, packet or block of data. + +host + A computer. In particular a source or destination of messages + from the point of view of the communication network. + +Identification + An Internet Protocol field. This identifying value assigned + by the sender aids in assembling the fragments of a datagram. + +IMP + The Interface Message Processor, the packet switch of the + ARPANET. + +internet address + A source or destination address specific to the host level. + +internet datagram + The unit of data exchanged between an internet module and the + higher level protocol together with the internet header. + +internet fragment + A portion of the data of an internet datagram with an internet + header. + +IP + Internet Protocol. + +IRS + The Initial Receive Sequence number. The first sequence + number used by the sender on a connection. + +ISN + The Initial Sequence Number. The first sequence number used + on a connection, (either ISS or IRS). Selected on a clock + based procedure. + +ISS + The Initial Send Sequence number. The first sequence number + used by the sender on a connection. + +leader + Control information at the beginning of a message or block of + data. In particular, in the ARPANET, the control information + on an ARPANET message at the host-IMP interface. 
+ + + +[Page 80] + + +September 1981 + Transmission Control Protocol + Glossary + + + +left sequence + This is the next sequence number to be acknowledged by the + data receiving TCP (or the lowest currently unacknowledged + sequence number) and is sometimes referred to as the left edge + of the send window. + +local packet + The unit of transmission within a local network. + +module + An implementation, usually in software, of a protocol or other + procedure. + +MSL + Maximum Segment Lifetime, the time a TCP segment can exist in + the internetwork system. Arbitrarily defined to be 2 minutes. + +octet + An eight bit byte. + +Options + An Option field may contain several options, and each option + may be several octets in length. The options are used + primarily in testing situations; for example, to carry + timestamps. Both the Internet Protocol and TCP provide for + options fields. + +packet + A package of data with a header which may or may not be + logically complete. More often a physical packaging than a + logical packaging of data. + +port + The portion of a socket that specifies which logical input or + output channel of a process is associated with the data. + +process + A program in execution. A source or destination of data from + the point of view of the TCP or other host-to-host protocol. + +PUSH + A control bit occupying no sequence space, indicating that + this segment contains data that must be pushed through to the + receiving user. + +RCV.NXT + receive next sequence number + + + + [Page 81] + + + September 1981 +Transmission Control Protocol +Glossary + + + +RCV.UP + receive urgent pointer + +RCV.WND + receive window + +receive next sequence number + This is the next sequence number the local TCP is expecting to + receive. + +receive window + This represents the sequence numbers the local (receiving) TCP + is willing to receive. 
Thus, the local TCP considers that + segments overlapping the range RCV.NXT to + RCV.NXT + RCV.WND - 1 carry acceptable data or control. + Segments containing sequence numbers entirely outside of this + range are considered duplicates and discarded. + +RST + A control bit (reset), occupying no sequence space, indicating + that the receiver should delete the connection without further + interaction. The receiver can determine, based on the + sequence number and acknowledgment fields of the incoming + segment, whether it should honor the reset command or ignore + it. In no case does receipt of a segment containing RST give + rise to a RST in response. + +RTP + Real Time Protocol: A host-to-host protocol for communication + of time critical information. + +SEG.ACK + segment acknowledgment + +SEG.LEN + segment length + +SEG.PRC + segment precedence value + +SEG.SEQ + segment sequence + +SEG.UP + segment urgent pointer field + + + + + +[Page 82] + + +September 1981 + Transmission Control Protocol + Glossary + + + +SEG.WND + segment window field + +segment + A logical unit of data, in particular a TCP segment is the + unit of data transferred between a pair of TCP modules. + +segment acknowledgment + The sequence number in the acknowledgment field of the + arriving segment. + +segment length + The amount of sequence number space occupied by a segment, + including any controls which occupy sequence space. + +segment sequence + The number in the sequence field of the arriving segment. + +send sequence + This is the next sequence number the local (sending) TCP will + use on the connection. It is initially selected from an + initial sequence number curve (ISN) and is incremented for + each octet of data or sequenced control transmitted. + +send window + This represents the sequence numbers which the remote + (receiving) TCP is willing to receive. It is the value of the + window field specified in segments from the remote (data + receiving) TCP.
The range of new sequence numbers which may + be emitted by a TCP lies between SND.NXT and + SND.UNA + SND.WND - 1. (Retransmissions of sequence numbers + between SND.UNA and SND.NXT are expected, of course.) + +SND.NXT + send sequence + +SND.UNA + left sequence + +SND.UP + send urgent pointer + +SND.WL1 + segment sequence number at last window update + +SND.WL2 + segment acknowledgment number at last window update + + + + [Page 83] + + + September 1981 +Transmission Control Protocol +Glossary + + + +SND.WND + send window + +socket + An address which specifically includes a port identifier, that + is, the concatenation of an Internet Address with a TCP port. + +Source Address + The source address, usually the network and host identifiers. + +SYN + A control bit in the incoming segment, occupying one sequence + number, used at the initiation of a connection, to indicate + where the sequence numbering will start. + +TCB + Transmission control block, the data structure that records + the state of a connection. + +TCB.PRC + The precedence of the connection. + +TCP + Transmission Control Protocol: A host-to-host protocol for + reliable communication in internetwork environments. + +TOS + Type of Service, an Internet Protocol field. + +Type of Service + An Internet Protocol field which indicates the type of service + for this internet fragment. + +URG + A control bit (urgent), occupying no sequence space, used to + indicate that the receiving user should be notified to do + urgent processing as long as there is data to be consumed with + sequence numbers less than the value indicated in the urgent + pointer. + +urgent pointer + A control field meaningful only when the URG bit is on. This + field communicates the value of the urgent pointer which + indicates the data octet associated with the sending user's + urgent call. + + + + + +[Page 84] + + +September 1981 + Transmission Control Protocol + + + + REFERENCES + + + +[1] Cerf, V., and R. 
Kahn, "A Protocol for Packet Network + Intercommunication", IEEE Transactions on Communications, + Vol. COM-22, No. 5, pp 637-648, May 1974. + +[2] Postel, J. (ed.), "Internet Protocol - DARPA Internet Program + Protocol Specification", RFC 791, USC/Information Sciences + Institute, September 1981. + +[3] Dalal, Y. and C. Sunshine, "Connection Management in Transport + Protocols", Computer Networks, Vol. 2, No. 6, pp. 454-473, + December 1978. + +[4] Postel, J., "Assigned Numbers", RFC 790, USC/Information Sciences + Institute, September 1981. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + [Page 85] + diff --git a/ext/picotcp/RFC/rfc1066.txt b/ext/picotcp/RFC/rfc1066.txt new file mode 100644 index 0000000..66aae55 --- /dev/null +++ b/ext/picotcp/RFC/rfc1066.txt @@ -0,0 +1,5043 @@ + + + + + + +Network Working Group K. McCloghrie +Request For Comments: 1066 M. Rose + TWG + August 1988 + + + Management Information Base for Network Management + of TCP/IP-based internets + + Table of Contents + + 1. Status of this Memo ................................... 1 + 2. IAB POLICY STATEMENT .................................. 2 + 3. Introduction .......................................... 2 + 4. Objects ............................................... 5 + 4.1 Object Groups ........................................ 5 + 4.2 Format of Definitions ................................ 6 + 5. Object Definitions .................................... 7 + 5.1 The System Group ..................................... 8 + 5.2 The Interfaces Group ................................. 10 + 5.2.1 The Interfaces Table ............................... 10 + 5.3 The Address Translation Group ........................ 22 + 5.4 The IP Group ......................................... 25 + 5.4.1 The IP Address Table ............................... 33 + 5.4.2 The IP Routing Table ............................... 35 + 5.5 The ICMP Group ....................................... 
42 + 5.6 The TCP Group ........................................ 52 + 5.7 The UDP Group ........................................ 61 + 5.8 The EGP Group ........................................ 63 + 5.8.1 The EGP Neighbor Table ............................. 64 + 6. Definitions ........................................... 67 + 7. Acknowledgements ...................................... 88 + 8. References ............................................ 89 + +1. Status of this Memo + + This memo provides the initial version of the Management Information + Base (MIB) for use with network management protocols in TCP/IP-based + internets in the short-term. In particular, together with its + companion memos which describe the structure of management + information along with the initial network management protocol, these + documents provide a simple, workable architecture and system for + managing TCP/IP-based internets and in particular the Internet. + + + + + + + + +McCloghrie & Rose [Page 1] + +RFC 1066 MIB August 1988 + + + This memo specifies a draft standard for the Internet community. + TCP/IP implementations in the Internet which are network manageable + are expected to adopt and implement this specification. + + Distribution of this memo is unlimited. + +2. IAB POLICY STATEMENT + + This MIB specification is the first edition of an evolving document + defining variables needed for monitoring and control of various + components of the Internet. Not all groups of defined variables are + mandatory for all Internet components. + + For example, the EGP group is mandatory for gateways using EGP but + not for hosts which should not be running EGP. Similarly, the TCP + group is mandatory for hosts running TCP but not for gateways which + aren't running it. What IS mandatory, however, is that all variables + of a group be supported if any element of the group is supported. 
+ + It is expected that additional MIB groups and variables will be + defined over time to accommodate the monitoring and control needs of + new or changing components of the Internet. The MIB working group + will continue to refine this specification and projects a revision + incorporating new requirements in early 1989. + +3. Introduction + + As reported in RFC 1052, IAB Recommendations for the Development of + Internet Network Management Standards [1], the Internet Activities + Board has directed the Internet Engineering Task Force (IETF) to + create two new working groups in the area of network management. One + group is charged with the further specification and definition of + elements to be included in the Management Information Base. The + other is charged with defining the modifications to the Simple + Network Management Protocol (SNMP) to accommodate the short-term + needs of the network vendor and operator communities. The long-term + needs of the Internet community are to be met using the ISO CMIS/CMIP + [2,3] framework as a basis. An existing IETF working group, the + "NETMAN" group, is already engaged in defining the use of CMIS/CMIP + in a TCP/IP network, and will continue with responsibility for + addressing the longer-term requirements. + + The output of the MIB working group is to be provided to both the + SNMP working group and the NETMAN group, so as to ensure + compatibility of monitored items for both network management + frameworks. + + The MIB working group has produced this memo and a companion. The + + + +McCloghrie & Rose [Page 2] + +RFC 1066 MIB August 1988 + + + companion memo [4] defines a Structure for Management Information + (SMI) for use by the managed objects contained in the MIB. This memo + defines the list of managed objects. 
+ + The IAB also urged the working groups to be "extremely sensitive to + the need to keep SNMP simple," and recommends that the MIB working + group take as its starting inputs the MIB definitions found in the + High-Level Entity Management Systems (HEMS) RFC 1024 [5], the initial + SNMP specification [6], and the CMIS/CMIP memos [7,8]. + + Thus, the list of managed objects defined here, has been derived by + taking only those elements which are considered essential. Since + such elements are essential, there is no need to allow the + implementation of individual objects, to be optional. Rather, all + compliant implementations will contain all applicable (see below) + objects defined in this memo. + + This approach of taking only the essential objects is NOT + restrictive, since the SMI defined in the companion memo provides + three extensibility mechanisms: one, the addition of new standard + objects through the definitions of new versions of the MIB; two, the + addition of widely-available but non-standard objects through the + multilateral subtree; and three, the addition of private objects + through the enterprises subtree. Such additional objects can not only + be used for vendor-specific elements, but also for experimentation as + required to further the knowledge of which other objects are + essential. + + The primary criterion for being considered essential was for an + object to be contained in all of the above referenced MIB + definitions. A few other objects have been included, but only if the + MIB working group believed they are truly essential. The detailed + list of criteria against which potential inclusions in this (initial) + MIB were considered, was: + + 1) An object needed to be essential for either fault or + configuration management. + + 2) Only weak control objects were permitted (by weak, it + is meant that tampering with them can do only limited + damage). 
This criterion reflects the fact that the + current management protocols are not sufficiently secure + to do more powerful control operations. + + 3) Evidence of current use and utility was required. + + 4) An attempt was made to limit the number of objects to + about 100 to make it easier for vendors to fully + + + +McCloghrie & Rose [Page 3] + +RFC 1066 MIB August 1988 + + + instrument their software. + + 5) To avoid redundant variables, it was required that no + object be included that can be derived from others in the + MIB. + + 6) Implementation specific objects (e.g., for BSD UNIX) + were excluded. + + 7) It was agreed to avoid heavily instrumenting critical + sections of code. The general guideline was one counter + per critical section per layer. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +McCloghrie & Rose [Page 4] + +RFC 1066 MIB August 1988 + + +4. Objects + + Managed objects are accessed via a virtual information store, termed + the Management Information Base or MIB. Objects in the MIB are + defined using Abstract Syntax Notation One (ASN.1) [9]. + + The mechanisms used for describing these objects are specified in the + companion memo. In particular, each object has a name, a syntax, and + an encoding. The name is an object identifier, an administratively + assigned name, which specifies an object type. The object type + together with an object instance serves to uniquely identify a + specific instantiation of the object. For human convenience, we + often use a textual string, termed the OBJECT DESCRIPTOR, to also + refer to the object type. + + The syntax of an object type defines the abstract data structure + corresponding to that object type. The ASN.1 language is used for + this purpose. However, the companion memo purposely restricts the + ASN.1 constructs which may be used. These restrictions are + explicitly made for simplicity. 
+ + The encoding of an object type is simply how that object type is + represented using the object type's syntax. Implicitly tied to the + notion of an object type's syntax and encoding is how the object type + is represented when being transmitted on the network. This memo + specifies the use of the basic encoding rules of ASN.1 [10]. + +4.1. Object Groups + + Since this list of managed objects contains only the essential + elements, there is no need to allow individual objects to be + optional. Rather, the objects are arranged into the following + groups: + + - System + - Interfaces + - Address Translation + - IP + - ICMP + - TCP + - UDP + - EGP + + There are two reasons for defining these groups: one, to provide a + means of assigning object identifiers; two, to provide a method for + implementations of managed agents to know which objects they must + implement. This method is as follows: if the semantics of a group is + applicable to an implementation, then it must implement all objects + + + +McCloghrie & Rose [Page 5] + +RFC 1066 MIB August 1988 + + + in that group. For example, an implementation must implement the EGP + group if and only if it implements the EGP protocol. + +4.2. Format of Definitions + + The next section contains the specification of all object types + contained in the MIB. Following the conventions of the companion + memo, the object types are defined using the following fields: + + OBJECT: + ------- + A textual name, termed the OBJECT DESCRIPTOR, for the + object type, along with its corresponding OBJECT + IDENTIFIER. + + Syntax: + The abstract syntax for the object type, presented using + ASN.1. This must resolve to an instance of the ASN.1 + type ObjectSyntax defined in the SMI. + + Definition: + A textual description of the semantics of the object + type. Implementations should ensure that their + interpretation of the object type fulfills this + definition since this MIB is intended for use in multi- + vendor environments. 
As such it is vital that object + types have consistent meaning across all machines. + + Access: + One of read-only, read-write, write-only, or + not-accessible. + + Status: + One of mandatory, optional, or obsolete. + + + + + + + + + + + + + + + + + +McCloghrie & Rose [Page 6] + +RFC 1066 MIB August 1988 + + +5. Object Definitions + + RFC1066-MIB { iso org(3) dod(6) internet(1) mgmt(2) 1 } + + DEFINITIONS ::= BEGIN + + IMPORTS + mgmt, OBJECT-TYPE, NetworkAddress, IpAddress, + Counter, Gauge, TimeTicks + FROM RFC1065-SMI; + + mib OBJECT IDENTIFIER ::= { mgmt 1 } + + system OBJECT IDENTIFIER ::= { mib 1 } + interfaces OBJECT IDENTIFIER ::= { mib 2 } + at OBJECT IDENTIFIER ::= { mib 3 } + ip OBJECT IDENTIFIER ::= { mib 4 } + icmp OBJECT IDENTIFIER ::= { mib 5 } + tcp OBJECT IDENTIFIER ::= { mib 6 } + udp OBJECT IDENTIFIER ::= { mib 7 } + egp OBJECT IDENTIFIER ::= { mib 8 } + + END + + + + + + + + + + + + + + + + + + + + + + + + + + + + +McCloghrie & Rose [Page 7] + +RFC 1066 MIB August 1988 + + +5.1. The System Group + + Implementation of the System group is mandatory for all + systems. + + OBJECT: + ------- + sysDescr { system 1 } + + Syntax: + OCTET STRING + + Definition: + A textual description of the entity. This value should + include the full name and version identification of the + system's hardware type, software operating-system, and + networking software. It is mandatory that this only + contain printable ASCII characters. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + sysObjectID { system 2 } + + Syntax: + OBJECT IDENTIFIER + + Definition: + The vendor's authoritative identification of the network + management subsystem contained in the entity. This value + is allocated within the SMI enterprises subtree + (1.3.6.1.4.1) and provides an easy and unambiguous means + for determining "what kind of box" is being managed. For + example, if vendor "Flintstones, Inc." 
was assigned the + subtree 1.3.6.1.4.1.42, it could assign the identifier + 1.3.6.1.4.1.42.1.1 to its "Fred Router". + + Access: + read-only. + + Status: + mandatory. + + + +McCloghrie & Rose [Page 8] + +RFC 1066 MIB August 1988 + + + OBJECT: + ------- + sysUpTime { system 3 } + + Syntax: + TimeTicks + + Definition: + The time (in hundredths of a second) since the network + management portion of the system was last re-initialized. + + Access: + read-only. + + Status: + mandatory. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +McCloghrie & Rose [Page 9] + +RFC 1066 MIB August 1988 + + +5.2. The Interfaces Group + + Implementation of the Interfaces group is mandatory for all + systems. + + OBJECT: + ------- + ifNumber { interfaces 1 } + + Syntax: + INTEGER + + Definition: + The number of network interfaces (regardless of their + current state) on which this system can send/receive IP + datagrams. + + Access: + read-only. + + Status: + mandatory. + +5.2.1. The Interfaces Table + + OBJECT: + ------- + ifTable { interfaces 2 } + + Syntax: + SEQUENCE OF IfEntry + + Definition: + A list of interface entries. The number of entries is + given by the value of ifNumber. + + Access: + read-write. + + Status: + mandatory. 
+ + OBJECT: + ------- + ifEntry { ifTable 1 } + + Syntax: + IfEntry ::= SEQUENCE { + + + +McCloghrie & Rose [Page 10] + +RFC 1066 MIB August 1988 + + + ifIndex + INTEGER, + ifDescr + OCTET STRING, + ifType + INTEGER, + ifMtu + INTEGER, + ifSpeed + Gauge, + ifPhysAddress + OCTET STRING, + ifAdminStatus + INTEGER, + ifOperStatus + INTEGER, + ifLastChange + TimeTicks, + ifInOctets + Counter, + ifInUcastPkts + Counter, + ifInNUcastPkts + Counter, + ifInDiscards + Counter, + ifInErrors + Counter, + ifInUnknownProtos + Counter, + ifOutOctets + Counter, + ifOutUcastPkts + Counter, + ifOutNUcastPkts + Counter, + ifOutDiscards + Counter, + ifOutErrors + Counter, + ifOutQLen + Gauge + } + + Definition: + An interface entry containing objects at the subnetwork + layer and below for a particular interface. + + + + +McCloghrie & Rose [Page 11] + +RFC 1066 MIB August 1988 + + + Access: + read-write. + + Status: + mandatory. + + + We now consider the individual components of each interface + entry: + + + OBJECT: + ------- + ifIndex { ifEntry 1 } + + Syntax: + INTEGER + + Definition: + A unique value for each interface. Its value ranges + between 1 and the value of ifNumber. The value for each + interface must remain constant at least from one re- + initialization of the entity's network management system + to the next re-initialization. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + ifDescr { ifEntry 2 } + + Syntax: + OCTET STRING + + Definition: + A text string containing information about the interface. + This string should include the name of the manufacturer, + the product name and the version of the hardware + interface. The string is intended for presentation to a + human; it must not contain anything but printable ASCII + characters. + + + + + +McCloghrie & Rose [Page 12] + +RFC 1066 MIB August 1988 + + + Access: + read-only. + + Status: + mandatory. 
+ + + OBJECT: + ------- + ifType { ifEntry 3 } + + Syntax: + INTEGER { + other(1), -- none of the following + regular1822(2), + hdh1822(3), + ddn-x25(4), + rfc877-x25(5), + ethernet-csmacd(6), + iso88023-csmacd(7), + iso88024-tokenBus(8), + iso88025-tokenRing(9), + iso88026-man(10), + starLan(11), + proteon-10MBit(12), + proteon-80MBit(13), + hyperchannel(14), + fddi(15), + lapb(16), + sdlc(17), + t1-carrier(18), + cept(19), -- european equivalent of T-1 + basicIsdn(20), + primaryIsdn(21), + -- proprietary serial + propPointToPointSerial(22) + } + + Definition: + The type of interface, distinguished according to the + physical/link/network protocol(s) immediately "below" IP + in the protocol stack. + + Access: + read-only. + + Status: + mandatory. + + + +McCloghrie & Rose [Page 13] + +RFC 1066 MIB August 1988 + + + OBJECT: + ------- + ifMtu { ifEntry 4 } + + Syntax: + INTEGER + + Definition: + The size of the largest IP datagram which can be + sent/received on the interface, specified in octets. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + ifSpeed { ifEntry 5 } + + Syntax: + Gauge + + Definition: + An estimate of the interface's current bandwidth in bits + per second. For interfaces which do not vary in + bandwidth or for those where no accurate estimation can + be made, this object should contain the nominal + bandwidth. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + ifPhysAddress { ifEntry 6 } + + Syntax: + OCTET STRING + + Definition: + The interface's address at the protocol layer immediately + + + +McCloghrie & Rose [Page 14] + +RFC 1066 MIB August 1988 + + + "below" IP in the protocol stack. For interfaces which + do not have such an address (e.g., a serial line), this + object should contain an octet string of zero length. + + Access: + read-only. + + Status: + mandatory. 
+ + + OBJECT: + ------- + ifAdminStatus { ifEntry 7 } + + Syntax: + INTEGER { + up(1), -- ready to pass packets + down(2), + testing(3) -- in some test mode + } + + Definition: + The desired state of the interface. The testing(3) state + indicates that no operational packets can be passed. + + Access: + read-write. + + Status: + mandatory. + + + OBJECT: + ------- + ifOperStatus { ifEntry 8 } + + Syntax: + INTEGER { + up(1), -- ready to pass packets + down(2), + testing(3) -- in some test mode + } + + Definition: + The current operational state of the interface. The + testing(3) state indicates that no operational packets + can be passed. + + + +McCloghrie & Rose [Page 15] + +RFC 1066 MIB August 1988 + + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + ifLastChange { ifEntry 9 } + + Syntax: + TimeTicks + + Definition: + The value of sysUpTime at the time the interface entered + its current operational state. If the current state was + entered prior to the last re-initialization of the local + network management subsystem, then this object contains a + zero value. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + ifInOctets { ifEntry 10 } + + Syntax: + Counter + + Definition: + The total number of octets received on the interface, + including framing characters. + + Access: + read-only. + + Status: + mandatory. + + + + + + + +McCloghrie & Rose [Page 16] + +RFC 1066 MIB August 1988 + + + OBJECT: + ------- + ifInUcastPkts { ifEntry 11 } + + Syntax: + Counter + + Definition: + The number of (subnet) unicast packets delivered to a + higher-layer protocol. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + ifInNUcastPkts { ifEntry 12 } + + Syntax: + Counter + + Definition: + The number of non-unicast (i.e., subnet broadcast or + subnet multicast) packets delivered to a higher-layer + protocol. + + Access: + read-only. + + Status: + mandatory. 
+ + + OBJECT: + ------- + ifInDiscards { ifEntry 13 } + + Syntax: + Counter + + Definition: + The number of inbound packets which were chosen to be + discarded even though no errors had been detected to + prevent their being deliverable to a higher-layer + + + +McCloghrie & Rose [Page 17] + +RFC 1066 MIB August 1988 + + + protocol. One possible reason for discarding such a + packet could be to free up buffer space. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + ifInErrors { ifEntry 14 } + + Syntax: + Counter + + Definition: + The number of inbound packets that contained errors + preventing them from being deliverable to a higher-layer + protocol. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + ifInUnknownProtos { ifEntry 15 } + + Syntax: + Counter + + Definition: + The number of packets received via the interface which + were discarded because of an unknown or unsupported + protocol. + + Access: + read-only. + + Status: + mandatory. + + + + + +McCloghrie & Rose [Page 18] + +RFC 1066 MIB August 1988 + + + OBJECT: + ------- + ifOutOctets { ifEntry 16 } + + Syntax: + Counter + + Definition: + The total number of octets transmitted out of the + interface, including framing characters. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + ifOutUcastPkts { ifEntry 17 } + + Syntax: + Counter + + Definition: + The total number of packets that higher-level protocols + requested be transmitted to a subnet-unicast address, + including those that were discarded or not sent. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + ifOutNUcastPkts { ifEntry 18 } + + Syntax: + Counter + + Definition: + The total number of packets that higher-level protocols + requested be transmitted to a non-unicast (i.e., a subnet + broadcast or subnet multicast) address, including those + + + +McCloghrie & Rose [Page 19] + +RFC 1066 MIB August 1988 + + + that were discarded or not sent. 
+ + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + ifOutDiscards { ifEntry 19 } + + Syntax: + Counter + + Definition: + The number of outbound packets which were chosen to be + discarded even though no errors had been detected to + prevent their being transmitted. One possible reason for + discarding such a packet could be to free up buffer + space. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + ifOutErrors { ifEntry 20 } + + Syntax: + Counter + + Definition: + The number of outbound packets that could not be + transmitted because of errors. + + Access: + read-only. + + Status: + mandatory. + + + + + +McCloghrie & Rose [Page 20] + +RFC 1066 MIB August 1988 + + + OBJECT: + ------- + ifOutQLen { ifEntry 21 } + + Syntax: + Gauge + + Definition: + The length of the output packet queue (in packets). + + Access: + read-only. + + Status: + mandatory. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +McCloghrie & Rose [Page 21] + +RFC 1066 MIB August 1988 + + +5.3. The Address Translation Group + + Implementation of the Address Translation group is mandatory + for all systems. + + The Address Translation group contains one table which is the + union across all interfaces of the translation tables for + converting a NetworkAddress (e.g., an IP address) into a + subnetwork-specific address. For lack of a better term, this + document refers to such a subnetwork-specific address as a + "physical" address. + + Examples of such translation tables are: for broadcast media + where ARP is in use, the translation table is equivalent to + the ARP cache; or, on an X.25 network where non-algorithmic + translation to X.121 addresses is required, the translation + table contains the NetworkAddress to X.121 address + equivalences. 
+ + OBJECT: + ------- + atTable { at 1 } + + Syntax: + SEQUENCE OF AtEntry + + Definition: + The Address Translation tables contain the NetworkAddress + to "physical" address equivalences. Some interfaces do + not use translation tables for determining address + equivalences (e.g., DDN-X.25 has an algorithmic method); + if all interfaces are of this type, then the Address + Translation table is empty, i.e., has zero entries. + + Access: + read-write. + + Status: + mandatory. + + + OBJECT: + ------- + atEntry { atTable 1 } + + Syntax: + AtEntry ::= SEQUENCE { + atIfIndex + + + +McCloghrie & Rose [Page 22] + +RFC 1066 MIB August 1988 + + + INTEGER, + atPhysAddress + OCTET STRING, + atNetAddress + NetworkAddress + } + + Definition: + Each entry contains one NetworkAddress to "physical" + address equivalence. + + Access: + read-write. + + Status: + mandatory. + + We now consider the individual components of each Address + Translation table entry: + + + OBJECT: + ------- + atIfIndex { atEntry 1 } + + Syntax: + INTEGER + + Definition: + The interface on which this entry's equivalence is + effective. The interface identified by a particular + value of this index is the same interface as identified + by the same value of ifIndex. + + Access: + read-write. + + Status: + mandatory. + + + OBJECT: + ------- + atPhysAddress { atEntry 2 } + + Syntax: + OCTET STRING + + + + +McCloghrie & Rose [Page 23] + +RFC 1066 MIB August 1988 + + + Definition: + The media-dependent "physical" address. + + Access: + read-write. + + Status: + mandatory. + + + OBJECT: + ------- + atNetAddress { atEntry 3 } + + Syntax: + NetworkAddress + + Definition: + The NetworkAddress (e.g., the IP address) corresponding to + the media-dependent "physical" address. + + Access: + read-write. + + Status: + mandatory. + + + + + + + + + + + + + + + + + + + + + + + + + +McCloghrie & Rose [Page 24] + +RFC 1066 MIB August 1988 + + +5.4. The IP Group + + Implementation of the IP group is mandatory for all systems. 
+ + + OBJECT: + ------- + ipForwarding { ip 1 } + + Syntax: + INTEGER { + gateway(1), -- entity forwards datagrams + host(2) -- entity does NOT forward datagrams + } + + Definition: + The indication of whether this entity is acting as an IP + gateway in respect to the forwarding of datagrams + received by, but not addressed to, this entity. IP + gateways forward datagrams; Hosts do not (except those + Source-Routed via the host). + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + ipDefaultTTL { ip 2 } + + Syntax: + INTEGER + + Definition: + The default value inserted into the Time-To-Live field of + the IP header of datagrams originated at this entity, + whenever a TTL value is not supplied by the transport + layer protocol. + + Access: + read-write. + + Status: + mandatory. + + + + +McCloghrie & Rose [Page 25] + +RFC 1066 MIB August 1988 + + + OBJECT: + ------- + ipInReceives { ip 3 } + + Syntax: + Counter + + Definition: + The total number of input datagrams received from + interfaces, including those received in error. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + ipInHdrErrors { ip 4 } + + Syntax: + Counter + + Definition: + The number of input datagrams discarded due to errors in + their IP headers, including bad checksums, version number + mismatch, other format errors, time-to-live exceeded, + errors discovered in processing their IP options, etc. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + ipInAddrErrors { ip 5 } + + Syntax: + Counter + + Definition: + The number of input datagrams discarded because the IP + address in their IP header's destination field was not a + + + +McCloghrie & Rose [Page 26] + +RFC 1066 MIB August 1988 + + + valid address to be received at this entity. This count + includes invalid addresses (e.g., 0.0.0.0) and addresses + of unsupported Classes (e.g., Class E). 
For entities + which are not IP Gateways and therefore do not forward + datagrams, this counter includes datagrams discarded + because the destination address was not a local address. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + ipForwDatagrams { ip 6 } + + Syntax: + Counter + + Definition: + The number of input datagrams for which this entity was + not their final IP destination, as a result of which an + attempt was made to find a route to forward them to that + final destination. In entities which do not act as IP + Gateways, this counter will include only those packets + which were Source-Routed via this entity, and the + Source-Route option processing was successful. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + ipInUnknownProtos { ip 7 } + + Syntax: + Counter + + Definition: + The number of locally-addressed datagrams received + successfully but discarded because of an unknown or + unsupported protocol. + + + +McCloghrie & Rose [Page 27] + +RFC 1066 MIB August 1988 + + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + ipInDiscards { ip 8 } + + Syntax: + Counter + + Definition: + The number of input IP datagrams for which no problems + were encountered to prevent their continued processing, + but which were discarded (e.g. for lack of buffer space). + Note that this counter does not include any datagrams + discarded while awaiting re-assembly. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + ipInDelivers { ip 9 } + + Syntax: + Counter + + Definition: + The total number of input datagrams successfully + delivered to IP user-protocols (including ICMP). + + Access: + read-only. + + Status: + mandatory. 
+ + OBJECT: + ------- + ipOutRequests { ip 10 } + + + +McCloghrie & Rose [Page 28] + +RFC 1066 MIB August 1988 + + + Syntax: + Counter + + Definition: + The total number of IP datagrams which local IP user- + protocols (including ICMP) supplied to IP in requests for + transmission. Note that this counter does not include + any datagrams counted in ipForwDatagrams. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + ipOutDiscards { ip 11 } + + Syntax: + Counter + + Definition: + The number of output IP datagrams for which no problem + was encountered to prevent their transmission to their + destination, but which were discarded (e.g., for lack of + buffer space). Note that this counter would include + datagrams counted in ipForwDatagrams if any such packets + met this (discretionary) discard criterion. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + ipOutNoRoutes { ip 12 } + + Syntax: + Counter + + + + + + + +McCloghrie & Rose [Page 29] + +RFC 1066 MIB August 1988 + + + Definition: + The number of IP datagrams discarded because no route + could be found to transmit them to their destination. + Note that this counter includes any packets counted in + ipForwDatagrams which meet this "no-route" criterion. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + ipReasmTimeout { ip 13 } + + Syntax: + INTEGER + + Definition: + The maximum number of seconds which received fragments + are held while they are awaiting reassembly at this + entity. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + ipReasmReqds { ip 14 } + + Syntax: + Counter + + Definition: + The number of IP fragments received which needed to be + reassembled at this entity. + + Access: + read-only. + + Status: + mandatory. 
+ + + +McCloghrie & Rose [Page 30] + +RFC 1066 MIB August 1988 + + + OBJECT: + ------- + ipReasmOKs { ip 15 } + + Syntax: + Counter + + Definition: + The number of IP datagrams successfully re-assembled. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + ipReasmFails { ip 16 } + + Syntax: + Counter + + Definition: + The number of failures detected by the IP re-assembly + algorithm (for whatever reason: timed out, errors, etc.). + + Note that this is not necessarily a count of discarded IP + fragments since some algorithms (notably RFC 815's) can + lose track of the number of fragments by combining them + as they are received. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + ipFragOKs { ip 17 } + + Syntax: + Counter + + + + + +McCloghrie & Rose [Page 31] + +RFC 1066 MIB August 1988 + + + Definition: + The number of IP datagrams that have been successfully + fragmented at this entity. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + ipFragFails { ip 18 } + + Syntax: + Counter + + Definition: + The number of IP datagrams that have been discarded + because they needed to be fragmented at this entity but + could not be, e.g., because their "Don't Fragment" flag + was set. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + ipFragCreates { ip 19 } + + Syntax: + Counter + + Definition: + The number of IP datagram fragments that have been + generated as a result of fragmentation at this entity. + + Access: + read-only. + + Status: + mandatory. + + + + +McCloghrie & Rose [Page 32] + +RFC 1066 MIB August 1988 + + +5.4.1. The IP Address Table + + The IP Address table contains this entity's IP addressing + information. + + + OBJECT: + ------- + ipAddrTable { ip 20 } + + Syntax: + SEQUENCE OF IpAddrEntry + + Definition: + The table of addressing information relevant to this + entity's IP addresses. + + Access: + read-only. + + Status: + mandatory.
+ + + OBJECT: + ------- + ipAddrEntry { ipAddrTable 1 } + + Syntax: + IpAddrEntry ::= SEQUENCE { + ipAdEntAddr + IpAddress, + ipAdEntIfIndex + INTEGER, + ipAdEntNetMask + IpAddress, + ipAdEntBcastAddr + INTEGER + } + + Definition: + The addressing information for one of this entity's IP + addresses. + + Access: + read-only. + + + + + +McCloghrie & Rose [Page 33] + +RFC 1066 MIB August 1988 + + + Status: + mandatory. + + + OBJECT: + ------- + ipAdEntAddr { ipAddrEntry 1 } + + Syntax: + IpAddress + + Definition: + The IP address to which this entry's addressing + information pertains. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + ipAdEntIfIndex { ipAddrEntry 2 } + + Syntax: + INTEGER + + Definition: + The index value which uniquely identifies the interface + to which this entry is applicable. The interface + identified by a particular value of this index is the + same interface as identified by the same value of + ifIndex. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + ipAdEntNetMask { ipAddrEntry 3 } + + + + + +McCloghrie & Rose [Page 34] + +RFC 1066 MIB August 1988 + + + Syntax: + IpAddress + + Definition: + The subnet mask associated with the IP address of this + entry. The value of the mask is an IP address with all + the network bits set to 1 and all the hosts bits set to + 0. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + ipAdEntBcastAddr { ipAddrEntry 4 } + + Syntax: + INTEGER + + Definition: + The value of the least-significant bit in the IP + broadcast address used for sending datagrams on the + (logical) interface associated with the IP address of + this entry. For example, when the Internet standard + all-ones broadcast address is used, the value will be 1. + + Access: + read-only. + + Status: + mandatory. + +5.4.2. The IP Routing Table + + The IP Routing Table contains an entry for each route + presently known to this entity. 
Note that the action to be + taken in response to a request to read a non-existent entry, + is specific to the network management protocol being used. + + + OBJECT: + ------- + ipRoutingTable { ip 21 } + + + + +McCloghrie & Rose [Page 35] + +RFC 1066 MIB August 1988 + + + Syntax: + SEQUENCE OF IpRouteEntry + + Definition: + This entity's IP Routing table. + + Access: + read-write. + + Status: + mandatory. + + + OBJECT: + ------- + ipRouteEntry { ipRoutingTable 1 } + + Syntax: + IpRouteEntry ::= SEQUENCE { + ipRouteDest + IpAddress, + ipRouteIfIndex + INTEGER, + ipRouteMetric1 + INTEGER, + ipRouteMetric2 + INTEGER, + ipRouteMetric3 + INTEGER, + ipRouteMetric4 + INTEGER, + ipRouteNextHop + IpAddress, + ipRouteType + INTEGER, + ipRouteProto + INTEGER, + ipRouteAge + INTEGER + } + + Definition: + A route to a particular destination. + + Access: + read-write. + + + + + +McCloghrie & Rose [Page 36] + +RFC 1066 MIB August 1988 + + + Status: + mandatory. + + We now consider the individual components of each route in the + IP Routing Table: + + + OBJECT: + ------- + ipRouteDest { ipRouteEntry 1 } + + Syntax: + IpAddress + + Definition: + The destination IP address of this route. An entry with + a value of 0.0.0.0 is considered a default route. + Multiple such default routes can appear in the table, but + access to such multiple entries is dependent on the + table-access mechanisms defined by the network management + protocol in use. + + Access: + read-write. + + Status: + mandatory. + + + OBJECT: + ------- + ipRouteIfIndex { ipRouteEntry 2 } + + Syntax: + INTEGER + + Definition: + The index value which uniquely identifies the local + interface through which the next hop of this route should + be reached. The interface identified by a particular + value of this index is the same interface as identified + by the same value of ifIndex. + + Access: + read-write. + + Status: + mandatory. 
+ + + +McCloghrie & Rose [Page 37] + +RFC 1066 MIB August 1988 + + + OBJECT: + ------- + ipRouteMetric1 { ipRouteEntry 3 } + + Syntax: + INTEGER + + Definition: + The primary routing metric for this route. The semantics + of this metric are determined by the routing-protocol + specified in the route's ipRouteProto value. If this + metric is not used, its value should be set to -1. + + Access: + read-write. + + Status: + mandatory. + + + OBJECT: + ------- + ipRouteMetric2 { ipRouteEntry 4 } + + Syntax: + INTEGER + + Definition: + An alternate routing metric for this route. The + semantics of this metric are determined by the routing- + protocol specified in the route's ipRouteProto value. If + this metric is not used, its value should be set to -1. + + Access: + read-write. + + Status: + mandatory. + + + OBJECT: + ------- + ipRouteMetric3 { ipRouteEntry 5 } + + Syntax: + INTEGER + + + + + +McCloghrie & Rose [Page 38] + +RFC 1066 MIB August 1988 + + + Definition: + An alternate routing metric for this route. The + semantics of this metric are determined by the routing- + protocol specified in the route's ipRouteProto value. If + this metric is not used, its value should be set to -1. + + Access: + read-write. + + Status: + mandatory. + + + OBJECT: + ------- + ipRouteMetric4 { ipRouteEntry 6 } + + Syntax: + INTEGER + + Definition: + An alternate routing metric for this route. The + semantics of this metric are determined by the routing- + protocol specified in the route's ipRouteProto value. If + this metric is not used, its value should be set to -1. + + Access: + read-write. + + Status: + mandatory. + + + OBJECT: + ------- + ipRouteNextHop { ipRouteEntry 7 } + + Syntax: + IpAddress + + Definition: + The IP address of the next hop of this route. + + Access: + read-write. + + Status: + mandatory. 
+ + + +McCloghrie & Rose [Page 39] + +RFC 1066 MIB August 1988 + + + OBJECT: + ------- + ipRouteType { ipRouteEntry 8 } + + Syntax: + INTEGER { + other(1), -- none of the following + + invalid(2), -- an invalidated route + + -- route to directly + direct(3), -- connected (sub-)network + + -- route to a non-local + remote(4) -- host/network/sub-network + } + + Definition: + The type of route. + + Access: + read-write. + + Status: + mandatory. + + + OBJECT: + ------- + ipRouteProto { ipRouteEntry 9 } + + Syntax: + INTEGER { + other(1), -- none of the following + + -- non-protocol information, + -- e.g., manually configured + local(2), -- entries + + -- set via a network management + netmgmt(3), -- protocol + + -- obtained via ICMP, + icmp(4), -- e.g., Redirect + + -- the remaining values are + -- all gateway routing protocols + egp(5), + + + +McCloghrie & Rose [Page 40] + +RFC 1066 MIB August 1988 + + + ggp(6), + hello(7), + rip(8), + is-is(9), + es-is(10), + ciscoIgrp(11), + bbnSpfIgp(12), + oigp(13) + } + + Definition: + The routing mechanism via which this route was learned. + Inclusion of values for gateway routing protocols is not + intended to imply that hosts should support those + protocols. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + ipRouteAge { ipRouteEntry 10 } + + Syntax: + INTEGER + + Definition: + The number of seconds since this route was last updated + or otherwise determined to be correct. Note that no + semantics of "too old" can be implied except through + knowledge of the routing protocol by which the route was + learned. + + Access: + read-write. + + Status: + mandatory. + + + + + + + + + +McCloghrie & Rose [Page 41] + +RFC 1066 MIB August 1988 + + +5.5. The ICMP Group + + Implementation of the ICMP group is mandatory for all systems. + + The ICMP group contains the ICMP input and output statistics.
+ + Note that individual counters for ICMP message (sub-)codes have been + omitted from this (version of the) MIB for simplicity. + + + OBJECT: + ------- + icmpInMsgs { icmp 1 } + + Syntax: + Counter + + Definition: + The total number of ICMP messages which the entity + received. Note that this counter includes all those + counted by icmpInErrors. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + icmpInErrors { icmp 2 } + + Syntax: + Counter + + Definition: + The number of ICMP messages which the entity received but + determined as having errors (bad ICMP checksums, bad + length, etc.). + + Access: + read-only. + + Status: + mandatory. + + + + + +McCloghrie & Rose [Page 42] + +RFC 1066 MIB August 1988 + + + OBJECT: + ------- + icmpInDestUnreachs { icmp 3 } + + Syntax: + Counter + + Definition: + The number of ICMP Destination Unreachable messages + received. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + icmpInTimeExcds { icmp 4 } + + Syntax: + Counter + + Definition: + The number of ICMP Time Exceeded messages received. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + icmpInParmProbs { icmp 5 } + + Syntax: + Counter + + Definition: + The number of ICMP Parameter Problem messages received. + + Access: + read-only. + + + + +McCloghrie & Rose [Page 43] + +RFC 1066 MIB August 1988 + + + Status: + mandatory. + + + OBJECT: + ------- + icmpInSrcQuenchs { icmp 6 } + + Syntax: + Counter + + Definition: + The number of ICMP Source Quench messages received. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + icmpInRedirects { icmp 7 } + + Syntax: + Counter + + Definition: + The number of ICMP Redirect messages received. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + icmpInEchos { icmp 8 } + + Syntax: + Counter + + Definition: + The number of ICMP Echo (request) messages received. 
+ + + + +McCloghrie & Rose [Page 44] + +RFC 1066 MIB August 1988 + + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + icmpInEchoReps { icmp 9 } + + Syntax: + Counter + + Definition: + The number of ICMP Echo Reply messages received. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + icmpInTimestamps { icmp 10 } + + Syntax: + Counter + + Definition: + The number of ICMP Timestamp (request) messages received. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + icmpInTimestampReps { icmp 11 } + + Syntax: + Counter + + + + +McCloghrie & Rose [Page 45] + +RFC 1066 MIB August 1988 + + + Definition: + The number of ICMP Timestamp Reply messages received. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + icmpInAddrMasks { icmp 12 } + + Syntax: + Counter + + Definition: + The number of ICMP Address Mask Request messages + received. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + icmpInAddrMaskReps { icmp 13 } + + Syntax: + Counter + + Definition: + The number of ICMP Address Mask Reply messages received. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + icmpOutMsgs { icmp 14 } + + + +McCloghrie & Rose [Page 46] + +RFC 1066 MIB August 1988 + + + Syntax: + Counter + + Definition: + The total number of ICMP messages which this entity + attempted to send. Note that this counter includes all + those counted by icmpOutErrors. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + icmpOutErrors { icmp 15 } + + Syntax: + Counter + + Definition: + The number of ICMP messages which this entity did not + send due to problems discovered within ICMP such as a + lack of buffers. This value should not include errors + discovered outside the ICMP layer such as the inability + of IP to route the resultant datagram. In some + implementations there may be no types of error which + contribute to this counter's value. 
+ + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + icmpOutDestUnreachs { icmp 16 } + + Syntax: + Counter + + Definition: + The number of ICMP Destination Unreachable messages sent. + + + + +McCloghrie & Rose [Page 47] + +RFC 1066 MIB August 1988 + + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + icmpOutTimeExcds { icmp 17 } + + Syntax: + Counter + + Definition: + The number of ICMP Time Exceeded messages sent. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + icmpOutParmProbs { icmp 18 } + + Syntax: + Counter + + Definition: + The number of ICMP Parameter Problem messages sent. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + icmpOutSrcQuenchs { icmp 19 } + + Syntax: + Counter + + + + +McCloghrie & Rose [Page 48] + +RFC 1066 MIB August 1988 + + + Definition: + The number of ICMP Source Quench messages sent. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + icmpOutRedirects { icmp 20 } + + Syntax: + Counter + + Definition: + The number of ICMP Redirect messages sent. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + icmpOutEchos { icmp 21 } + + Syntax: + Counter + + Definition: + The number of ICMP Echo (request) messages sent. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + icmpOutEchoReps { icmp 22 } + + + + +McCloghrie & Rose [Page 49] + +RFC 1066 MIB August 1988 + + + Syntax: + Counter + + Definition: + The number of ICMP Echo Reply messages sent. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + icmpOutTimestamps { icmp 23 } + + Syntax: + Counter + + Definition: + The number of ICMP Timestamp (request) messages sent. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + icmpOutTimestampReps { icmp 24 } + + Syntax: + Counter + + Definition: + The number of ICMP Timestamp Reply messages sent. + + Access: + read-only. 
+ + Status: + mandatory. + + + + + + +McCloghrie & Rose [Page 50] + +RFC 1066 MIB August 1988 + + + OBJECT: + ------- + icmpOutAddrMasks { icmp 25 } + + Syntax: + Counter + + Definition: + The number of ICMP Address Mask Request messages sent. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + icmpOutAddrMaskReps { icmp 26 } + + Syntax: + Counter + + Definition: + The number of ICMP Address Mask Reply messages sent. + + Access: + read-only. + + Status: + mandatory. + + + + + + + + + + + + + + + + + + + +McCloghrie & Rose [Page 51] + +RFC 1066 MIB August 1988 + + +5.6. The TCP Group + + Implementation of the TCP group is mandatory for all systems + that implement the TCP protocol. + + Note that instances of object types that represent information + about a particular TCP connection are transient; they persist + only as long as the connection in question. + + OBJECT: + ------- + tcpRtoAlgorithm { tcp 1 } + + Syntax: + INTEGER { + other(1), -- none of the following + constant(2), -- a constant rto + rsre(3), -- MIL-STD-1778, Appendix B + vanj(4) -- Van Jacobson's algorithm [11] + } + + Definition: + The algorithm used to determine the timeout value used + for retransmitting unacknowledged octets. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + tcpRtoMin { tcp 2 } + + Syntax: + INTEGER + + Definition: + The minimum value permitted by a TCP implementation + for the retransmission timeout, measured in + milliseconds. More refined semantics for objects + of this type depend upon the algorithm used to + determine the retransmission timeout. In particular, + when the timeout algorithm is rsre(3), an object + of this type has the semantics of the LBOUND + quantity described in RFC 793. + + + +McCloghrie & Rose [Page 52] + +RFC 1066 MIB August 1988 + + + Access: + read-only. + + Status: + mandatory. 
+ + + OBJECT: + ------- + tcpRtoMax { tcp 3 } + + Syntax: + INTEGER + + Definition: + The maximum value permitted by a TCP implementation + for the retransmission timeout, measured + in milliseconds. More refined semantics for objects + of this type depend upon the algorithm used to + determine the retransmission timeout. In particular, + when the timeout algorithm is rsre(3), an object of + this type has the semantics of the UBOUND quantity + described in RFC 793. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + tcpMaxConn { tcp 4 } + + Syntax: + INTEGER + + Definition: + The limit on the total number of TCP connections the + entity can support. In entities where the maximum + number of connections is dynamic, this object should + contain the value "-1". + + Access: + read-only. + + + + + +McCloghrie & Rose [Page 53] + +RFC 1066 MIB August 1988 + + + Status: + mandatory. + + + OBJECT: + ------- + tcpActiveOpens { tcp 5 } + + Syntax: + Counter + + Definition: + The number of times TCP connections have made a direct + transition to the SYN-SENT state from the CLOSED + state. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + tcpPassiveOpens { tcp 6 } + + Syntax: + Counter + + Definition: + The number of times TCP connections have made a direct + transition to the SYN-RCVD state from the LISTEN + state. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + tcpAttemptFails { tcp 7 } + + Syntax: + Counter + + + +McCloghrie & Rose [Page 54] + +RFC 1066 MIB August 1988 + + + Definition: + The number of times TCP connections have made a direct + transition to the CLOSED state from either the + SYN-SENT state or the SYN-RCVD state, plus the number + of times TCP connections have made a direct transition + to the LISTEN state from the SYN-RCVD state. + + Access: + read-only. + + Status: + mandatory. 
+ + + OBJECT: + ------- + tcpEstabResets { tcp 8 } + + Syntax: + Counter + + Definition: + The number of times TCP connections have made a direct + transition to the CLOSED state from either the + ESTABLISHED state or the CLOSE-WAIT state. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + tcpCurrEstab { tcp 9 } + + Syntax: + Gauge + + Definition: + The number of TCP connections for which the current + state is either ESTABLISHED or CLOSE-WAIT. + + Access: + read-only. + + + + + +McCloghrie & Rose [Page 55] + +RFC 1066 MIB August 1988 + + + Status: + mandatory. + + + OBJECT: + ------- + tcpInSegs { tcp 10 } + + Syntax: + Counter + + Definition: + The total number of segments received, including those + received in error. This count includes segments + received on currently established connections. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + tcpOutSegs { tcp 11 } + + Syntax: + Counter + + Definition: + The total number of segments sent, including those on + current connections but excluding those containing + only retransmitted octets. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + tcpRetransSegs { tcp 12 } + + Syntax: + Counter + + + +McCloghrie & Rose [Page 56] + +RFC 1066 MIB August 1988 + + + Definition: + The total number of segments retransmitted - that is, + the number of TCP segments transmitted containing one + or more previously transmitted octets. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + tcpConnTable { tcp 13 } + + Syntax: + SEQUENCE OF TcpConnEntry + + Definition: + A table containing TCP connection-specific + information. + + Access: + read-only. + + Status: + mandatory. 
+ + + OBJECT: + ------- + tcpConnEntry { tcpConnTable 1 } + + Syntax: + TcpConnEntry ::= SEQUENCE { + tcpConnState + INTEGER, + tcpConnLocalAddress + IpAddress, + tcpConnLocalPort + INTEGER (0..65535), + tcpConnRemAddress + IpAddress, + tcpConnRemPort + INTEGER (0..65535) + } + + + + +McCloghrie & Rose [Page 57] + +RFC 1066 MIB August 1988 + + + Definition: + Information about a particular current TCP connection. + An object of this type is transient, in that it ceases + to exist when (or soon after) the connection makes the + transition to the CLOSED state. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + tcpConnState { tcpConnEntry 1 } + + Syntax: + INTEGER { + closed(1), + listen(2), + synSent(3), + synReceived(4), + established(5), + finWait1(6), + finWait2(7), + closeWait(8), + lastAck(9), + closing(10), + timeWait(11) + } + + Definition: + The state of this TCP connection. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + tcpConnLocalAddress { tcpConnEntry 2 } + + Syntax: + IpAddress + + + +McCloghrie & Rose [Page 58] + +RFC 1066 MIB August 1988 + + + Definition: + The local IP address for this TCP connection. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + tcpConnLocalPort { tcpConnEntry 3 } + + Syntax: + INTEGER (0..65535) + + Definition: + The local port number for this TCP connection. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + tcpConnRemAddress { tcpConnEntry 4 } + + Syntax: + IpAddress + + Definition: + The remote IP address for this TCP connection. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + tcpConnRemPort { tcpConnEntry 5 } + + + + +McCloghrie & Rose [Page 59] + +RFC 1066 MIB August 1988 + + + Syntax: + INTEGER (0..65535) + + Definition: + The remote port number for this TCP connection. + + Access: + read-only. + + Status: + mandatory. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +McCloghrie & Rose [Page 60] + +RFC 1066 MIB August 1988 + + +5.7. The UDP Group + + Implementation of the UDP group is mandatory for all systems + which implement the UDP protocol. + + OBJECT: + ------- + udpInDatagrams { udp 1 } + + Syntax: + Counter + + Definition: + The total number of UDP datagrams delivered to UDP + users. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + udpNoPorts { udp 2 } + + Syntax: + Counter + + Definition: + The total number of received UDP datagrams for which + there was no application at the destination port. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + udpInErrors { udp 3 } + + Syntax: + Counter + + + + +McCloghrie & Rose [Page 61] + +RFC 1066 MIB August 1988 + + + Definition: + The number of received UDP datagrams that could not be + delivered for reasons other than the lack of an + application at the destination port. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + udpOutDatagrams { udp 4 } + + Syntax: + Counter + + Definition: + The total number of UDP datagrams sent from this + entity. + + Access: + read-only. + + Status: + mandatory. + + + + + + + + + + + + + + + + + + + + + + + +McCloghrie & Rose [Page 62] + +RFC 1066 MIB August 1988 + + +5.8. The EGP Group + + Implementation of the EGP group is mandatory for all systems + which implement the EGP protocol. + + OBJECT: + ------- + egpInMsgs { egp 1 } + + Syntax: + Counter + + Definition: + The number of EGP messages received without error. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + egpInErrors { egp 2 } + + Syntax: + Counter + + Definition: + The number of EGP messages received that proved to be + in error. + + Access: + read-only. + + Status: + mandatory. 
+ + + OBJECT: + ------- + egpOutMsgs { egp 3 } + + Syntax: + Counter + + + + + +McCloghrie & Rose [Page 63] + +RFC 1066 MIB August 1988 + + + Definition: + The total number of locally generated EGP messages. + + Access: + read-only. + + Status: + mandatory. + + + OBJECT: + ------- + egpOutErrors { egp 4 } + + Syntax: + Counter + + Definition: + The number of locally generated EGP messages not sent + due to resource limitations within an EGP entity. + + Access: + read-only. + + Status: + mandatory. + +5.8.1. The EGP Neighbor Table + + The EGP Neighbor table contains information about this entity's EGP + neighbors. + + + OBJECT: + ------- + egpNeighTable { egp 5 } + + Syntax: + SEQUENCE OF EgpNeighEntry + + Definition: + The EGP neighbor table. + + Access: + read-only. + + Status: + mandatory. + + + +McCloghrie & Rose [Page 64] + +RFC 1066 MIB August 1988 + + + OBJECT: + ------- + egpNeighEntry { egpNeighTable 1 } + + Syntax: + EgpNeighEntry ::= SEQUENCE { + egpNeighState + INTEGER, + egpNeighAddr + IpAddress + } + + Definition: + Information about this entity's relationship with a + particular EGP neighbor. + + Access: + read-only. + + Status: + mandatory. + + + We now consider the individual components of each EGP + neighbor entry: + + + OBJECT: + ------- + egpNeighState { egpNeighEntry 1 } + + Syntax: + INTEGER { + idle(1), + acquisition(2), + down(3), + up(4), + cease(5) + } + + Definition: + The EGP state of the local system with respect to this + entry's EGP neighbor. Each EGP state is represented + by a value that is one greater than the numerical + value associated with said state in RFC 904. + + Access: + read-only. + + + +McCloghrie & Rose [Page 65] + +RFC 1066 MIB August 1988 + + + Status: + mandatory. + + + OBJECT: + ------- + egpNeighAddr { egpNeighEntry 2 } + + Syntax: + IpAddress + + Definition: + The IP address of this entry's EGP neighbor. + + Access: + read-only. + + Status: + mandatory.
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +McCloghrie & Rose [Page 66] + +RFC 1066 MIB August 1988 + + +6. Definitions + + RFC1066-MIB { iso org(3) dod(6) internet(1) mgmt(2) 1 } + + DEFINITIONS ::= BEGIN + + IMPORTS + mgmt, OBJECT-TYPE, NetworkAddress, IpAddress, + Counter, Gauge, TimeTicks + FROM RFC1065-SMI; + + mib OBJECT IDENTIFIER ::= { mgmt 1 } + + system OBJECT IDENTIFIER ::= { mib 1 } + interfaces OBJECT IDENTIFIER ::= { mib 2 } + at OBJECT IDENTIFIER ::= { mib 3 } + ip OBJECT IDENTIFIER ::= { mib 4 } + icmp OBJECT IDENTIFIER ::= { mib 5 } + tcp OBJECT IDENTIFIER ::= { mib 6 } + udp OBJECT IDENTIFIER ::= { mib 7 } + egp OBJECT IDENTIFIER ::= { mib 8 } + + -- object types + + -- the System group + + sysDescr OBJECT-TYPE + SYNTAX OCTET STRING + ACCESS read-only + STATUS mandatory + ::= { system 1 } + + sysObjectID OBJECT-TYPE + SYNTAX OBJECT IDENTIFIER + ACCESS read-only + STATUS mandatory + ::= { system 2 } + + sysUpTime OBJECT-TYPE + SYNTAX TimeTicks + ACCESS read-only + STATUS mandatory + ::= { system 3 } + + -- the Interfaces group + + ifNumber OBJECT-TYPE + SYNTAX INTEGER + + + +McCloghrie & Rose [Page 67] + +RFC 1066 MIB August 1988 + + + ACCESS read-only + STATUS mandatory + ::= { interfaces 1 } + + -- the Interfaces table + + ifTable OBJECT-TYPE + SYNTAX SEQUENCE OF IfEntry + ACCESS read-write + STATUS mandatory + ::= { interfaces 2 } + + ifEntry OBJECT-TYPE + SYNTAX IfEntry + ACCESS read-write + STATUS mandatory + ::= { ifTable 1 } + + IfEntry ::= SEQUENCE { + ifIndex + INTEGER, + ifDescr + OCTET STRING, + ifType + INTEGER, + ifMtu + INTEGER, + ifSpeed + Gauge, + ifPhysAddress + OCTET STRING, + ifAdminStatus + INTEGER, + ifOperStatus + INTEGER, + ifLastChange + TimeTicks, + ifInOctets + Counter, + ifInUcastPkts + Counter, + ifInNUcastPkts + Counter, + ifInDiscards + Counter, + ifInErrors + Counter, + ifInUnknownProtos + + + +McCloghrie & Rose [Page 68] + +RFC 1066 MIB August 1988 + + + Counter, + ifOutOctets + Counter, + 
ifOutUcastPkts + Counter, + ifOutNUcastPkts + Counter, + ifOutDiscards + Counter, + ifOutErrors + Counter, + ifOutQLen + Gauge + } + + ifIndex OBJECT-TYPE + SYNTAX INTEGER + ACCESS read-only + STATUS mandatory + ::= { ifEntry 1 } + + ifDescr OBJECT-TYPE + SYNTAX OCTET STRING + ACCESS read-only + STATUS mandatory + ::= { ifEntry 2 } + + ifType OBJECT-TYPE + SYNTAX INTEGER { + other(1), -- none of the following + regular1822(2), + hdh1822(3), + ddn-x25(4), + rfc877-x25(5), + ethernet-csmacd(6), + iso88023-csmacd(7), + iso88024-tokenBus(8), + iso88025-tokenRing(9), + iso88026-man(10), + starLan(11), + proteon-10MBit(12), + proteon-80MBit(13), + hyperchannel(14), + fddi(15), + lapb(16), + sdlc(17), + t1-carrier(18), + cept(19), + + + +McCloghrie & Rose [Page 69] + +RFC 1066 MIB August 1988 + + + basicIsdn(20), + primaryIsdn(21), + -- proprietary serial + propPointToPointSerial(22) + } + ACCESS read-only + STATUS mandatory + ::= { ifEntry 3 } + + ifMtu OBJECT-TYPE + SYNTAX INTEGER + ACCESS read-only + STATUS mandatory + ::= { ifEntry 4 } + + ifSpeed OBJECT-TYPE + SYNTAX Gauge + ACCESS read-only + STATUS mandatory + ::= { ifEntry 5 } + + ifPhysAddress OBJECT-TYPE + SYNTAX OCTET STRING + ACCESS read-only + STATUS mandatory + ::= { ifEntry 6 } + + ifAdminStatus OBJECT-TYPE + SYNTAX INTEGER { + up(1), -- ready to pass packets + down(2), + testing(3) -- in some test mode + } + ACCESS read-write + STATUS mandatory + ::= { ifEntry 7 } + + ifOperStatus OBJECT-TYPE + SYNTAX INTEGER { + up(1), -- ready to pass packets + down(2), + testing(3) -- in some test mode + } + ACCESS read-only + STATUS mandatory + ::= { ifEntry 8 } + + ifLastChange OBJECT-TYPE + + + +McCloghrie & Rose [Page 70] + +RFC 1066 MIB August 1988 + + + SYNTAX TimeTicks + ACCESS read-only + STATUS mandatory + ::= { ifEntry 9 } + + ifInOctets OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { ifEntry 10 } + + ifInUcastPkts OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + 
::= { ifEntry 11 } + + ifInNUcastPkts OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { ifEntry 12 } + + ifInDiscards OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { ifEntry 13 } + + ifInErrors OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { ifEntry 14 } + + ifInUnknownProtos OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { ifEntry 15 } + + ifOutOctets OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { ifEntry 16 } + + ifOutUcastPkts OBJECT-TYPE + + + +McCloghrie & Rose [Page 71] + +RFC 1066 MIB August 1988 + + + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { ifEntry 17 } + + ifOutNUcastPkts OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { ifEntry 18 } + + ifOutDiscards OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { ifEntry 19 } + + ifOutErrors OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { ifEntry 20 } + + ifOutQLen OBJECT-TYPE + SYNTAX Gauge + ACCESS read-only + STATUS mandatory + ::= { ifEntry 21 } + + -- the Address Translation group + + atTable OBJECT-TYPE + SYNTAX SEQUENCE OF AtEntry + ACCESS read-write + STATUS mandatory + ::= { at 1 } + + atEntry OBJECT-TYPE + SYNTAX AtEntry + ACCESS read-write + STATUS mandatory + ::= { atTable 1 } + + AtEntry ::= SEQUENCE { + atIfIndex + INTEGER, + atPhysAddress + OCTET STRING, + + + +McCloghrie & Rose [Page 72] + +RFC 1066 MIB August 1988 + + + atNetAddress + NetworkAddress + } + + atIfIndex OBJECT-TYPE + SYNTAX INTEGER + ACCESS read-write + STATUS mandatory + ::= { atEntry 1 } + + atPhysAddress OBJECT-TYPE + SYNTAX OCTET STRING + ACCESS read-write + STATUS mandatory + ::= { atEntry 2 } + + atNetAddress OBJECT-TYPE + SYNTAX NetworkAddress + ACCESS read-write + STATUS mandatory + ::= { atEntry 3 } + + -- the IP group + + ipForwarding OBJECT-TYPE + SYNTAX INTEGER { + gateway(1), -- entity forwards 
datagrams + host(2) -- entity does NOT forward datagrams + } + ACCESS read-only + STATUS mandatory + ::= { ip 1 } + + ipDefaultTTL OBJECT-TYPE + SYNTAX INTEGER + ACCESS read-write + STATUS mandatory + ::= { ip 2 } + + ipInReceives OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { ip 3 } + + ipInHdrErrors OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + + + +McCloghrie & Rose [Page 73] + +RFC 1066 MIB August 1988 + + + STATUS mandatory + ::= { ip 4 } + + ipInAddrErrors OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { ip 5 } + + ipForwDatagrams OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { ip 6 } + + ipInUnknownProtos OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { ip 7 } + + ipInDiscards OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { ip 8 } + + ipInDelivers OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { ip 9 } + + ipOutRequests OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { ip 10 } + + ipOutDiscards OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { ip 11 } + + ipOutNoRoutes OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + + + +McCloghrie & Rose [Page 74] + +RFC 1066 MIB August 1988 + + + STATUS mandatory + ::= { ip 12 } + + ipReasmTimeout OBJECT-TYPE + SYNTAX INTEGER + ACCESS read-only + STATUS mandatory + ::= { ip 13 } + + ipReasmReqds OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { ip 14 } + + ipReasmOKs OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { ip 15 } + + ipReasmFails OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { ip 16 } + + ipFragOKs OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { ip 17 } + + ipFragFails OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { ip 18 } + + ipFragCreates OBJECT-TYPE + SYNTAX Counter + 
ACCESS read-only + STATUS mandatory + ::= { ip 19 } + + -- the IP Interface table + + ipAddrTable OBJECT-TYPE + + + +McCloghrie & Rose [Page 75] + +RFC 1066 MIB August 1988 + + + SYNTAX SEQUENCE OF IpAddrEntry + ACCESS read-only + STATUS mandatory + ::= { ip 20 } + + ipAddrEntry OBJECT-TYPE + SYNTAX IpAddrEntry + ACCESS read-only + STATUS mandatory + ::= { ipAddrTable 1 } + + IpAddrEntry ::= SEQUENCE { + ipAdEntAddr + IpAddress, + ipAdEntIfIndex + INTEGER, + ipAdEntNetMask + IpAddress, + ipAdEntBcastAddr + INTEGER + } + + ipAdEntAddr OBJECT-TYPE + SYNTAX IpAddress + ACCESS read-only + STATUS mandatory + ::= { ipAddrEntry 1 } + + ipAdEntIfIndex OBJECT-TYPE + SYNTAX INTEGER + ACCESS read-only + STATUS mandatory + ::= { ipAddrEntry 2 } + + ipAdEntNetMask OBJECT-TYPE + SYNTAX IpAddress + ACCESS read-only + STATUS mandatory + ::= { ipAddrEntry 3 } + + ipAdEntBcastAddr OBJECT-TYPE + SYNTAX INTEGER + ACCESS read-only + STATUS mandatory + ::= { ipAddrEntry 4 } + + -- the IP Routing table + + + + +McCloghrie & Rose [Page 76] + +RFC 1066 MIB August 1988 + + + ipRoutingTable OBJECT-TYPE + SYNTAX SEQUENCE OF IpRouteEntry + ACCESS read-write + STATUS mandatory + ::= { ip 21 } + + ipRouteEntry OBJECT-TYPE + SYNTAX IpRouteEntry + ACCESS read-write + STATUS mandatory + ::= { ipRoutingTable 1 } + + IpRouteEntry ::= SEQUENCE { + ipRouteDest + IpAddress, + ipRouteIfIndex + INTEGER, + ipRouteMetric1 + INTEGER, + ipRouteMetric2 + INTEGER, + ipRouteMetric3 + INTEGER, + ipRouteMetric4 + INTEGER, + ipRouteNextHop + IpAddress, + ipRouteType + INTEGER, + ipRouteProto + INTEGER, + ipRouteAge + INTEGER + } + + ipRouteDest OBJECT-TYPE + SYNTAX IpAddress + ACCESS read-write + STATUS mandatory + ::= { ipRouteEntry 1 } + + ipRouteIfIndex OBJECT-TYPE + SYNTAX INTEGER + ACCESS read-write + STATUS mandatory + ::= { ipRouteEntry 2 } + + ipRouteMetric1 OBJECT-TYPE + + + +McCloghrie & Rose [Page 77] + +RFC 1066 MIB August 1988 + + + SYNTAX INTEGER + ACCESS read-write + STATUS mandatory + ::= { 
ipRouteEntry 3 } + + ipRouteMetric2 OBJECT-TYPE + SYNTAX INTEGER + ACCESS read-write + STATUS mandatory + ::= { ipRouteEntry 4 } + + ipRouteMetric3 OBJECT-TYPE + SYNTAX INTEGER + ACCESS read-write + STATUS mandatory + ::= { ipRouteEntry 5 } + + ipRouteMetric4 OBJECT-TYPE + SYNTAX INTEGER + ACCESS read-write + STATUS mandatory + ::= { ipRouteEntry 6 } + + ipRouteNextHop OBJECT-TYPE + SYNTAX IpAddress + ACCESS read-write + STATUS mandatory + ::= { ipRouteEntry 7 } + + ipRouteType OBJECT-TYPE + SYNTAX INTEGER { + other(1), -- none of the following + + invalid(2), -- an invalidated route + + -- route to directly + direct(3), -- connected (sub-)network + + -- route to a non-local + remote(4) -- host/network/sub-network + } + ACCESS read-write + STATUS mandatory + ::= { ipRouteEntry 8 } + + ipRouteProto OBJECT-TYPE + SYNTAX INTEGER { + other(1), -- none of the following + + + +McCloghrie & Rose [Page 78] + +RFC 1066 MIB August 1988 + + + -- non-protocol information + -- e.g., manually + local(2), -- configured entries + + -- set via a network + netmgmt(3), -- management protocol + + -- obtained via ICMP, + icmp(4), -- e.g., Redirect + + -- the following are + -- gateway routing protocols + egp(5), + ggp(6), + hello(7), + rip(8), + is-is(9), + es-is(10), + ciscoIgrp(11), + bbnSpfIgp(12), + oigp(13) + } + ACCESS read-only + STATUS mandatory + ::= { ipRouteEntry 9 } + + ipRouteAge OBJECT-TYPE + SYNTAX INTEGER + ACCESS read-write + STATUS mandatory + ::= { ipRouteEntry 10 } + + -- the ICMP group + + icmpInMsgs OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { icmp 1 } + + icmpInErrors OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { icmp 2 } + + icmpInDestUnreachs OBJECT-TYPE + SYNTAX Counter + + + +McCloghrie & Rose [Page 79] + +RFC 1066 MIB August 1988 + + + ACCESS read-only + STATUS mandatory + ::= { icmp 3 } + + icmpInTimeExcds OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { icmp 4 } + +
icmpInParmProbs OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { icmp 5 } + + icmpInSrcQuenchs OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { icmp 6 } + + icmpInRedirects OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { icmp 7 } + + icmpInEchos OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { icmp 8 } + + icmpInEchoReps OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { icmp 9 } + + icmpInTimestamps OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { icmp 10 } + + icmpInTimestampReps OBJECT-TYPE + SYNTAX Counter + + + +McCloghrie & Rose [Page 80] + +RFC 1066 MIB August 1988 + + + ACCESS read-only + STATUS mandatory + ::= { icmp 11 } + + icmpInAddrMasks OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { icmp 12 } + + icmpInAddrMaskReps OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { icmp 13 } + + icmpOutMsgs OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { icmp 14 } + + icmpOutErrors OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { icmp 15 } + + icmpOutDestUnreachs OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { icmp 16 } + + icmpOutTimeExcds OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { icmp 17 } + + icmpOutParmProbs OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { icmp 18 } + + icmpOutSrcQuenchs OBJECT-TYPE + SYNTAX Counter + + + +McCloghrie & Rose [Page 81] + +RFC 1066 MIB August 1988 + + + ACCESS read-only + STATUS mandatory + ::= { icmp 19 } + + icmpOutRedirects OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { icmp 20 } + + icmpOutEchos OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { icmp 21 } + + icmpOutEchoReps OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS 
mandatory + ::= { icmp 22 } + + icmpOutTimestamps OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { icmp 23 } + + icmpOutTimestampReps OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { icmp 24 } + + icmpOutAddrMasks OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { icmp 25 } + + icmpOutAddrMaskReps OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { icmp 26 } + + -- the TCP group + + + + +McCloghrie & Rose [Page 82] + +RFC 1066 MIB August 1988 + + + tcpRtoAlgorithm OBJECT-TYPE + SYNTAX INTEGER { + other(1), -- none of the following + constant(2), -- a constant rto + rsre(3), -- MIL-STD-1778, Appendix B + vanj(4) -- Van Jacobson's algorithm [11] + } + ACCESS read-only + STATUS mandatory + ::= { tcp 1 } + + tcpRtoMin OBJECT-TYPE + SYNTAX INTEGER + ACCESS read-only + STATUS mandatory + ::= { tcp 2 } + + tcpRtoMax OBJECT-TYPE + SYNTAX INTEGER + ACCESS read-only + STATUS mandatory + ::= { tcp 3 } + + tcpMaxConn OBJECT-TYPE + SYNTAX INTEGER + ACCESS read-only + STATUS mandatory + ::= { tcp 4 } + + tcpActiveOpens OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { tcp 5 } + + tcpPassiveOpens OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { tcp 6 } + + tcpAttemptFails OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { tcp 7 } + + tcpEstabResets OBJECT-TYPE + + + +McCloghrie & Rose [Page 83] + +RFC 1066 MIB August 1988 + + + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { tcp 8 } + + tcpCurrEstab OBJECT-TYPE + SYNTAX Gauge + ACCESS read-only + STATUS mandatory + ::= { tcp 9 } + + tcpInSegs OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { tcp 10 } + + tcpOutSegs OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { tcp 11 } + + tcpRetransSegs OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { tcp 12 } + + -- the TCP 
connections table + + tcpConnTable OBJECT-TYPE + SYNTAX SEQUENCE OF TcpConnEntry + ACCESS read-only + STATUS mandatory + ::= { tcp 13 } + + tcpConnEntry OBJECT-TYPE + SYNTAX TcpConnEntry + ACCESS read-only + STATUS mandatory + ::= { tcpConnTable 1 } + + TcpConnEntry ::= SEQUENCE { + tcpConnState + INTEGER, + tcpConnLocalAddress + IpAddress, + + + +McCloghrie & Rose [Page 84] + +RFC 1066 MIB August 1988 + + + tcpConnLocalPort + INTEGER (0..65535), + tcpConnRemAddress + IpAddress, + tcpConnRemPort + INTEGER (0..65535) + } + + tcpConnState OBJECT-TYPE + SYNTAX INTEGER { + closed(1), + listen(2), + synSent(3), + synReceived(4), + established(5), + finWait1(6), + finWait2(7), + closeWait(8), + lastAck(9), + closing(10), + timeWait(11) + } + ACCESS read-only + STATUS mandatory + ::= { tcpConnEntry 1 } + + tcpConnLocalAddress OBJECT-TYPE + SYNTAX IpAddress + ACCESS read-only + STATUS mandatory + ::= { tcpConnEntry 2 } + + tcpConnLocalPort OBJECT-TYPE + SYNTAX INTEGER (0..65535) + ACCESS read-only + STATUS mandatory + ::= { tcpConnEntry 3 } + + tcpConnRemAddress OBJECT-TYPE + SYNTAX IpAddress + ACCESS read-only + STATUS mandatory + ::= { tcpConnEntry 4 } + + tcpConnRemPort OBJECT-TYPE + SYNTAX INTEGER (0..65535) + ACCESS read-only + STATUS mandatory + + + +McCloghrie & Rose [Page 85] + +RFC 1066 MIB August 1988 + + + ::= { tcpConnEntry 5 } + + -- the UDP group + + udpInDatagrams OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { udp 1 } + + udpNoPorts OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { udp 2 } + + udpInErrors OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { udp 3 } + + udpOutDatagrams OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { udp 4 } + + -- the EGP group + + egpInMsgs OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { egp 1 } + + egpInErrors OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { egp 2 } + + 
egpOutMsgs OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { egp 3 } + + + + +McCloghrie & Rose [Page 86] + +RFC 1066 MIB August 1988 + + + egpOutErrors OBJECT-TYPE + SYNTAX Counter + ACCESS read-only + STATUS mandatory + ::= { egp 4 } + + -- the EGP Neighbor table + + egpNeighTable OBJECT-TYPE + SYNTAX SEQUENCE OF EgpNeighEntry + ACCESS read-only + STATUS mandatory + ::= { egp 5 } + + egpNeighEntry OBJECT-TYPE + SYNTAX EgpNeighEntry + ACCESS read-only + STATUS mandatory + ::= { egpNeighTable 1 } + + EgpNeighEntry ::= SEQUENCE { + egpNeighState + INTEGER, + egpNeighAddr + IpAddress + } + + egpNeighState OBJECT-TYPE + SYNTAX INTEGER { + idle(1), + acquisition(2), + down(3), + up(4), + cease(5) + } + ACCESS read-only + STATUS mandatory + ::= { egpNeighEntry 1 } + + egpNeighAddr OBJECT-TYPE + SYNTAX IpAddress + ACCESS read-only + STATUS mandatory + ::= { egpNeighEntry 2 } + + END + + + + + +McCloghrie & Rose [Page 87] + +RFC 1066 MIB August 1988 + + +7. Acknowledgements + + The initial draft of this memo was heavily influenced by the HEMS + [5] and SNMP [6] MIBs. + + Its final form is the result of the suggestions, the discussions, and + the compromises reached by the members of the IETF MIB working group: + + Karl Auerbach, Epilogue Technology + K. Ramesh Babu, Excelan + Lawrence Besaw, Hewlett-Packard + Jeffrey D. Case, University of Tennessee at Knoxville + James R. Davin, Proteon + Mark S. Fedor, NYSERNet + Robb Foster, BBN + Phill Gross, The MITRE Corporation + Bent Torp Jensen, Convergent Technology + Lee Labarre, The MITRE Corporation + Dan Lynch, Advanced Computing Environments + Keith McCloghrie, The Wollongong Group + Dave Mackie, 3Com/Bridge + Craig Partridge, BBN (chair) + Jim Robertson, 3Com/Bridge + Marshall T.
Rose, The Wollongong Group + Greg Satz, cisco + Martin Lee Schoffstall, Rensselaer Polytechnic Institute + Lou Steinberg, IBM + Dean Throop, Data General + Unni Warrier, Unisys + + + + + + + + + + + + + + + + + + + + + + +McCloghrie & Rose [Page 88] + +RFC 1066 MIB August 1988 + + +8. References + + [1] Cerf, V., "IAB Recommendations for the Development of Internet + Network Management Standards", RFC 1052, IAB, April 1988. + + [2] Information processing systems - Open Systems Interconnection, + "Management Information Services Definition", International + Organization for Standardization, Draft Proposal 9595/2, + December 1987. + + [3] Information processing systems - Open Systems Interconnection, + "Management Information Protocol Specification", International + Organization for Standardization, Draft Proposal 9596/2, + December 1987. + + [4] Rose M., and K. McCloghrie, "Structure and Identification of + Management Information for TCP/IP-based internets", RFC 1065, + TWG, August 1988. + + [5] Partridge C., and G. Trewitt, "The High-Level Entity Management + System (HEMS)", RFCs 1021-1024, BBN and Stanford, October 1987. + + [6] Case, J., M. Fedor, M. Schoffstall, and J. Davin, "A Simple + Network Management Protocol", RFC 1067, University of Tennessee + At Knoxville, NYSERNet, Rensselaer Polytechnic, Proteon, August + 1988. + + [7] LaBarre, L., "Structure and Identification of Management + Information for the Internet", Internet Engineering Task Force + working note, Network Information Center, SRI International, + Menlo Park, California, April 1988. + + [8] LaBarre, L., "Transport Layer Management Information: TCP", + Internet Engineering Task Force working note in preparation. + Network Information Center, SRI International, Menlo Park, + California, (unpublished). 
+ + [9] Information processing systems - Open Systems Interconnection, + "Specification of Abstract Syntax Notation One (ASN.1)", + International Organization for Standardization, International + Standard 8824, December 1987. + + [10] Information processing systems - Open Systems Interconnection, + "Specification of Basic Encoding Rules for Abstract Notation One + (ASN.1)", International Organization for Standardization, + International Standard 8825, December 1987. + + [11] Jacobson, V., "Congestion Avoidance and Control", SIGCOMM, 1988, + + + +McCloghrie & Rose [Page 89] + +RFC 1066 MIB August 1988 + + + Stanford, California. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +McCloghrie & Rose [Page 90] + \ No newline at end of file diff --git a/ext/picotcp/RFC/rfc1122.txt b/ext/picotcp/RFC/rfc1122.txt new file mode 100644 index 0000000..c14f2e5 --- /dev/null +++ b/ext/picotcp/RFC/rfc1122.txt @@ -0,0 +1,6844 @@ + + + + + + +Network Working Group Internet Engineering Task Force +Request for Comments: 1122 R. Braden, Editor + October 1989 + + + Requirements for Internet Hosts -- Communication Layers + + +Status of This Memo + + This RFC is an official specification for the Internet community. It + incorporates by reference, amends, corrects, and supplements the + primary protocol standards documents relating to hosts. Distribution + of this document is unlimited. + +Summary + + This is one RFC of a pair that defines and discusses the requirements + for Internet host software. This RFC covers the communications + protocol layers: link layer, IP layer, and transport layer; its + companion RFC-1123 covers the application and support protocols. + + + + Table of Contents + + + + + 1. INTRODUCTION ............................................... 5 + 1.1 The Internet Architecture .............................. 6 + 1.1.1 Internet Hosts .................................... 
6 + 1.1.2 Architectural Assumptions ......................... 7 + 1.1.3 Internet Protocol Suite ........................... 8 + 1.1.4 Embedded Gateway Code ............................. 10 + 1.2 General Considerations ................................. 12 + 1.2.1 Continuing Internet Evolution ..................... 12 + 1.2.2 Robustness Principle .............................. 12 + 1.2.3 Error Logging ..................................... 13 + 1.2.4 Configuration ..................................... 14 + 1.3 Reading this Document .................................. 15 + 1.3.1 Organization ...................................... 15 + 1.3.2 Requirements ...................................... 16 + 1.3.3 Terminology ....................................... 17 + 1.4 Acknowledgments ........................................ 20 + + 2. LINK LAYER .................................................. 21 + 2.1 INTRODUCTION ........................................... 21 + + + +Internet Engineering Task Force [Page 1] + + + + +RFC1122 INTRODUCTION October 1989 + + + 2.2 PROTOCOL WALK-THROUGH .................................. 21 + 2.3 SPECIFIC ISSUES ........................................ 21 + 2.3.1 Trailer Protocol Negotiation ...................... 21 + 2.3.2 Address Resolution Protocol -- ARP ................ 22 + 2.3.2.1 ARP Cache Validation ......................... 22 + 2.3.2.2 ARP Packet Queue ............................. 24 + 2.3.3 Ethernet and IEEE 802 Encapsulation ............... 24 + 2.4 LINK/INTERNET LAYER INTERFACE .......................... 25 + 2.5 LINK LAYER REQUIREMENTS SUMMARY ........................ 26 + + 3. INTERNET LAYER PROTOCOLS .................................... 27 + 3.1 INTRODUCTION ............................................ 27 + 3.2 PROTOCOL WALK-THROUGH .................................. 29 + 3.2.1 Internet Protocol -- IP ............................ 29 + 3.2.1.1 Version Number ............................... 
29 + 3.2.1.2 Checksum ..................................... 29 + 3.2.1.3 Addressing ................................... 29 + 3.2.1.4 Fragmentation and Reassembly ................. 32 + 3.2.1.5 Identification ............................... 32 + 3.2.1.6 Type-of-Service .............................. 33 + 3.2.1.7 Time-to-Live ................................. 34 + 3.2.1.8 Options ...................................... 35 + 3.2.2 Internet Control Message Protocol -- ICMP .......... 38 + 3.2.2.1 Destination Unreachable ...................... 39 + 3.2.2.2 Redirect ..................................... 40 + 3.2.2.3 Source Quench ................................ 41 + 3.2.2.4 Time Exceeded ................................ 41 + 3.2.2.5 Parameter Problem ............................ 42 + 3.2.2.6 Echo Request/Reply ........................... 42 + 3.2.2.7 Information Request/Reply .................... 43 + 3.2.2.8 Timestamp and Timestamp Reply ................ 43 + 3.2.2.9 Address Mask Request/Reply ................... 45 + 3.2.3 Internet Group Management Protocol IGMP ........... 47 + 3.3 SPECIFIC ISSUES ........................................ 47 + 3.3.1 Routing Outbound Datagrams ........................ 47 + 3.3.1.1 Local/Remote Decision ........................ 47 + 3.3.1.2 Gateway Selection ............................ 48 + 3.3.1.3 Route Cache .................................. 49 + 3.3.1.4 Dead Gateway Detection ....................... 51 + 3.3.1.5 New Gateway Selection ........................ 55 + 3.3.1.6 Initialization ............................... 56 + 3.3.2 Reassembly ........................................ 56 + 3.3.3 Fragmentation ..................................... 58 + 3.3.4 Local Multihoming ................................. 60 + 3.3.4.1 Introduction ................................. 60 + 3.3.4.2 Multihoming Requirements ..................... 61 + 3.3.4.3 Choosing a Source Address .................... 
64 + 3.3.5 Source Route Forwarding ........................... 65 + + + +Internet Engineering Task Force [Page 2] + + + + +RFC1122 INTRODUCTION October 1989 + + + 3.3.6 Broadcasts ........................................ 66 + 3.3.7 IP Multicasting ................................... 67 + 3.3.8 Error Reporting ................................... 69 + 3.4 INTERNET/TRANSPORT LAYER INTERFACE ..................... 69 + 3.5 INTERNET LAYER REQUIREMENTS SUMMARY .................... 72 + + 4. TRANSPORT PROTOCOLS ......................................... 77 + 4.1 USER DATAGRAM PROTOCOL -- UDP .......................... 77 + 4.1.1 INTRODUCTION ...................................... 77 + 4.1.2 PROTOCOL WALK-THROUGH ............................. 77 + 4.1.3 SPECIFIC ISSUES ................................... 77 + 4.1.3.1 Ports ........................................ 77 + 4.1.3.2 IP Options ................................... 77 + 4.1.3.3 ICMP Messages ................................ 78 + 4.1.3.4 UDP Checksums ................................ 78 + 4.1.3.5 UDP Multihoming .............................. 79 + 4.1.3.6 Invalid Addresses ............................ 79 + 4.1.4 UDP/APPLICATION LAYER INTERFACE ................... 79 + 4.1.5 UDP REQUIREMENTS SUMMARY .......................... 80 + 4.2 TRANSMISSION CONTROL PROTOCOL -- TCP ................... 82 + 4.2.1 INTRODUCTION ...................................... 82 + 4.2.2 PROTOCOL WALK-THROUGH ............................. 82 + 4.2.2.1 Well-Known Ports ............................. 82 + 4.2.2.2 Use of Push .................................. 82 + 4.2.2.3 Window Size .................................. 83 + 4.2.2.4 Urgent Pointer ............................... 84 + 4.2.2.5 TCP Options .................................. 85 + 4.2.2.6 Maximum Segment Size Option .................. 85 + 4.2.2.7 TCP Checksum ................................. 86 + 4.2.2.8 TCP Connection State Diagram ................. 
86 + 4.2.2.9 Initial Sequence Number Selection ............ 87 + 4.2.2.10 Simultaneous Open Attempts .................. 87 + 4.2.2.11 Recovery from Old Duplicate SYN ............. 87 + 4.2.2.12 RST Segment ................................. 87 + 4.2.2.13 Closing a Connection ........................ 87 + 4.2.2.14 Data Communication .......................... 89 + 4.2.2.15 Retransmission Timeout ...................... 90 + 4.2.2.16 Managing the Window ......................... 91 + 4.2.2.17 Probing Zero Windows ........................ 92 + 4.2.2.18 Passive OPEN Calls .......................... 92 + 4.2.2.19 Time to Live ................................ 93 + 4.2.2.20 Event Processing ............................ 93 + 4.2.2.21 Acknowledging Queued Segments ............... 94 + 4.2.3 SPECIFIC ISSUES ................................... 95 + 4.2.3.1 Retransmission Timeout Calculation ........... 95 + 4.2.3.2 When to Send an ACK Segment .................. 96 + 4.2.3.3 When to Send a Window Update ................. 97 + 4.2.3.4 When to Send Data ............................ 98 + + + +Internet Engineering Task Force [Page 3] + + + + +RFC1122 INTRODUCTION October 1989 + + + 4.2.3.5 TCP Connection Failures ...................... 100 + 4.2.3.6 TCP Keep-Alives .............................. 101 + 4.2.3.7 TCP Multihoming .............................. 103 + 4.2.3.8 IP Options ................................... 103 + 4.2.3.9 ICMP Messages ................................ 103 + 4.2.3.10 Remote Address Validation ................... 104 + 4.2.3.11 TCP Traffic Patterns ........................ 104 + 4.2.3.12 Efficiency .................................. 105 + 4.2.4 TCP/APPLICATION LAYER INTERFACE ................... 106 + 4.2.4.1 Asynchronous Reports ......................... 106 + 4.2.4.2 Type-of-Service .............................. 107 + 4.2.4.3 Flush Call ................................... 107 + 4.2.4.4 Multihoming .................................. 
108 + 4.2.5 TCP REQUIREMENT SUMMARY ........................... 108 + + 5. REFERENCES ................................................. 112 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Internet Engineering Task Force [Page 4] + + + + +RFC1122 INTRODUCTION October 1989 + + +1. INTRODUCTION + + This document is one of a pair that defines and discusses the + requirements for host system implementations of the Internet protocol + suite. This RFC covers the communication protocol layers: link + layer, IP layer, and transport layer. Its companion RFC, + "Requirements for Internet Hosts -- Application and Support" + [INTRO:1], covers the application layer protocols. This document + should also be read in conjunction with "Requirements for Internet + Gateways" [INTRO:2]. + + These documents are intended to provide guidance for vendors, + implementors, and users of Internet communication software. They + represent the consensus of a large body of technical experience and + wisdom, contributed by the members of the Internet research and + vendor communities. + + This RFC enumerates standard protocols that a host connected to the + Internet must use, and it incorporates by reference the RFCs and + other documents describing the current specifications for these + protocols. It corrects errors in the referenced documents and adds + additional discussion and guidance for an implementor. + + For each protocol, this document also contains an explicit set of + requirements, recommendations, and options. The reader must + understand that the list of requirements in this document is + incomplete by itself; the complete set of requirements for an + Internet host is primarily defined in the standard protocol + specification documents, with the corrections, amendments, and + supplements contained in this RFC. 
+ + A good-faith implementation of the protocols that was produced after + careful reading of the RFC's and with some interaction with the + Internet technical community, and that followed good communications + software engineering practices, should differ from the requirements + of this document in only minor ways. Thus, in many cases, the + "requirements" in this RFC are already stated or implied in the + standard protocol documents, so that their inclusion here is, in a + sense, redundant. However, they were included because some past + implementation has made the wrong choice, causing problems of + interoperability, performance, and/or robustness. + + This document includes discussion and explanation of many of the + requirements and recommendations. A simple list of requirements + would be dangerous, because: + + o Some required features are more important than others, and some + features are optional. + + + +Internet Engineering Task Force [Page 5] + + + + +RFC1122 INTRODUCTION October 1989 + + + o There may be valid reasons why particular vendor products that + are designed for restricted contexts might choose to use + different specifications. + + However, the specifications of this document must be followed to meet + the general goal of arbitrary host interoperation across the + diversity and complexity of the Internet system. Although most + current implementations fail to meet these requirements in various + ways, some minor and some major, this specification is the ideal + towards which we need to move. + + These requirements are based on the current level of Internet + architecture. This document will be updated as required to provide + additional clarifications or to include additional information in + those areas in which specifications are still evolving. + + This introductory section begins with a brief overview of the + Internet architecture as it relates to hosts, and then gives some + general advice to host software vendors. 
Finally, there is some + guidance on reading the rest of the document and some terminology. + + 1.1 The Internet Architecture + + General background and discussion on the Internet architecture and + supporting protocol suite can be found in the DDN Protocol + Handbook [INTRO:3]; for background see for example [INTRO:9], + [INTRO:10], and [INTRO:11]. Reference [INTRO:5] describes the + procedure for obtaining Internet protocol documents, while + [INTRO:6] contains a list of the numbers assigned within Internet + protocols. + + 1.1.1 Internet Hosts + + A host computer, or simply "host," is the ultimate consumer of + communication services. A host generally executes application + programs on behalf of user(s), employing network and/or + Internet communication services in support of this function. + An Internet host corresponds to the concept of an "End-System" + used in the OSI protocol suite [INTRO:13]. + + An Internet communication system consists of interconnected + packet networks supporting communication among host computers + using the Internet protocols. The networks are interconnected + using packet-switching computers called "gateways" or "IP + routers" by the Internet community, and "Intermediate Systems" + by the OSI world [INTRO:13]. The RFC "Requirements for + Internet Gateways" [INTRO:2] contains the official + specifications for Internet gateways. That RFC together with + + + +Internet Engineering Task Force [Page 6] + + + + +RFC1122 INTRODUCTION October 1989 + + + the present document and its companion [INTRO:1] define the + rules for the current realization of the Internet architecture. + + Internet hosts span a wide range of size, speed, and function. + They range in size from small microprocessors through + workstations to mainframes and supercomputers. 
In function, + they range from single-purpose hosts (such as terminal servers) + to full-service hosts that support a variety of online network + services, typically including remote login, file transfer, and + electronic mail. + + A host is generally said to be multihomed if it has more than + one interface to the same or to different networks. See + Section 1.3.3 on "Terminology". + + 1.1.2 Architectural Assumptions + + The current Internet architecture is based on a set of + assumptions about the communication system. The assumptions + most relevant to hosts are as follows: + + (a) The Internet is a network of networks. + + Each host is directly connected to some particular + network(s); its connection to the Internet is only + conceptual. Two hosts on the same network communicate + with each other using the same set of protocols that they + would use to communicate with hosts on distant networks. + + (b) Gateways don't keep connection state information. + + To improve robustness of the communication system, + gateways are designed to be stateless, forwarding each IP + datagram independently of other datagrams. As a result, + redundant paths can be exploited to provide robust service + in spite of failures of intervening gateways and networks. + + All state information required for end-to-end flow control + and reliability is implemented in the hosts, in the + transport layer or in application programs. All + connection control information is thus co-located with the + end points of the communication, so it will be lost only + if an end point fails. + + (c) Routing complexity should be in the gateways. + + Routing is a complex and difficult problem, and ought to + be performed by the gateways, not the hosts. An important + + + +Internet Engineering Task Force [Page 7] + + + + +RFC1122 INTRODUCTION October 1989 + + + objective is to insulate host software from changes caused + by the inevitable evolution of the Internet routing + architecture.
+ + (d) The System must tolerate wide network variation. + + A basic objective of the Internet design is to tolerate a + wide range of network characteristics -- e.g., bandwidth, + delay, packet loss, packet reordering, and maximum packet + size. Another objective is robustness against failure of + individual networks, gateways, and hosts, using whatever + bandwidth is still available. Finally, the goal is full + "open system interconnection": an Internet host must be + able to interoperate robustly and effectively with any + other Internet host, across diverse Internet paths. + + Sometimes host implementors have designed for less + ambitious goals. For example, the LAN environment is + typically much more benign than the Internet as a whole; + LANs have low packet loss and delay and do not reorder + packets. Some vendors have fielded host implementations + that are adequate for a simple LAN environment, but work + badly for general interoperation. The vendor justifies + such a product as being economical within the restricted + LAN market. However, isolated LANs seldom stay isolated + for long; they are soon gatewayed to each other, to + organization-wide internets, and eventually to the global + Internet system. In the end, neither the customer nor the + vendor is served by incomplete or substandard Internet + host software. + + The requirements spelled out in this document are designed + for a full-function Internet host, capable of full + interoperation over an arbitrary Internet path. + + + 1.1.3 Internet Protocol Suite + + To communicate using the Internet system, a host must implement + the layered set of protocols comprising the Internet protocol + suite. A host typically must implement at least one protocol + from each layer. 
+ + The protocol layers used in the Internet architecture are as + follows [INTRO:4]: + + + o Application Layer + + + +Internet Engineering Task Force [Page 8] + + + + +RFC1122 INTRODUCTION October 1989 + + + The application layer is the top layer of the Internet + protocol suite. The Internet suite does not further + subdivide the application layer, although some of the + Internet application layer protocols do contain some + internal sub-layering. The application layer of the + Internet suite essentially combines the functions of the + top two layers -- Presentation and Application -- of the + OSI reference model. + + We distinguish two categories of application layer + protocols: user protocols that provide service directly + to users, and support protocols that provide common system + functions. Requirements for user and support protocols + will be found in the companion RFC [INTRO:1]. + + The most common Internet user protocols are: + + o Telnet (remote login) + o FTP (file transfer) + o SMTP (electronic mail delivery) + + There are a number of other standardized user protocols + [INTRO:4] and many private user protocols. + + Support protocols, used for host name mapping, booting, + and management, include SNMP, BOOTP, RARP, and the Domain + Name System (DNS) protocols. + + + o Transport Layer + + The transport layer provides end-to-end communication + services for applications. There are two primary + transport layer protocols at present: + + o Transmission Control Protocol (TCP) + o User Datagram Protocol (UDP) + + TCP is a reliable connection-oriented transport service + that provides end-to-end reliability, resequencing, and + flow control. UDP is a connectionless ("datagram") + transport service. + + Other transport protocols have been developed by the + research community, and the set of official Internet + transport protocols may be expanded in the future. + + Transport layer protocols are discussed in Chapter 4. 
+ + + +Internet Engineering Task Force [Page 9] + + + + +RFC1122 INTRODUCTION October 1989 + + + o Internet Layer + + All Internet transport protocols use the Internet Protocol + (IP) to carry data from source host to destination host. + IP is a connectionless or datagram internetwork service, + providing no end-to-end delivery guarantees. Thus, IP + datagrams may arrive at the destination host damaged, + duplicated, out of order, or not at all. The layers above + IP are responsible for reliable delivery service when it + is required. The IP protocol includes provision for + addressing, type-of-service specification, fragmentation + and reassembly, and security information. + + The datagram or connectionless nature of the IP protocol + is a fundamental and characteristic feature of the + Internet architecture. Internet IP was the model for the + OSI Connectionless Network Protocol [INTRO:12]. + + ICMP is a control protocol that is considered to be an + integral part of IP, although it is architecturally + layered upon IP, i.e., it uses IP to carry its data end- + to-end just as a transport protocol like TCP or UDP does. + ICMP provides error reporting, congestion reporting, and + first-hop gateway redirection. + + IGMP is an Internet layer protocol used for establishing + dynamic host groups for IP multicasting. + + The Internet layer protocols IP, ICMP, and IGMP are + discussed in Chapter 3. + + + o Link Layer + + To communicate on its directly-connected network, a host + must implement the communication protocol used to + interface to that network. We call this a link layer or + media-access layer protocol. + + There is a wide variety of link layer protocols, + corresponding to the many different types of networks. + See Chapter 2. 
+ + + 1.1.4 Embedded Gateway Code + + Some Internet host software includes embedded gateway + functionality, so that these hosts can forward packets as a + + + +Internet Engineering Task Force [Page 10] + + + + +RFC1122 INTRODUCTION October 1989 + + + gateway would, while still performing the application layer + functions of a host. + + Such dual-purpose systems must follow the Gateway Requirements + RFC [INTRO:2] with respect to their gateway functions, and + must follow the present document with respect to their host + functions. In all overlapping cases, the two specifications + should be in agreement. + + There are varying opinions in the Internet community about + embedded gateway functionality. The main arguments are as + follows: + + o Pro: in a local network environment where networking is + informal, or in isolated internets, it may be convenient + and economical to use existing host systems as gateways. + + There is also an architectural argument for embedded + gateway functionality: multihoming is much more common + than originally foreseen, and multihoming forces a host to + make routing decisions as if it were a gateway. If the + multihomed host contains an embedded gateway, it will + have full routing knowledge and as a result will be able + to make more optimal routing decisions. + + o Con: Gateway algorithms and protocols are still changing, + and they will continue to change as the Internet system + grows larger. Attempting to include a general gateway + function within the host IP layer will force host system + maintainers to track these (more frequent) changes. Also, + a larger pool of gateway implementations will make + coordinating the changes more difficult. Finally, the + complexity of a gateway IP layer is somewhat greater than + that of a host, making the implementation and operation + tasks more complex. + + In addition, the style of operation of some hosts is not + appropriate for providing stable and robust gateway + service. 
+ + There is considerable merit in both of these viewpoints. One + conclusion can be drawn: a host administrator must have + conscious control over whether or not a given host acts as a + gateway. See Section 3.1 for the detailed requirements. + + + + + + + +Internet Engineering Task Force [Page 11] + + + + +RFC1122 INTRODUCTION October 1989 + + + 1.2 General Considerations + + There are two important lessons that vendors of Internet host + software have learned and which a new vendor should consider + seriously. + + 1.2.1 Continuing Internet Evolution + + The enormous growth of the Internet has revealed problems of + management and scaling in a large datagram-based packet + communication system. These problems are being addressed, and + as a result there will be continuing evolution of the + specifications described in this document. These changes will + be carefully planned and controlled, since there is extensive + participation in this planning by the vendors and by the + organizations responsible for operations of the networks. + + Development, evolution, and revision are characteristic of + computer network protocols today, and this situation will + persist for some years. A vendor who develops computer + communication software for the Internet protocol suite (or any + other protocol suite!) and then fails to maintain and update + that software for changing specifications is going to leave a + trail of unhappy customers. The Internet is a large + communication network, and the users are in constant contact + through it. Experience has shown that knowledge of + deficiencies in vendor software propagates quickly through the + Internet technical community.
+ + 1.2.2 Robustness Principle + + At every layer of the protocols, there is a general rule whose + application can lead to enormous benefits in robustness and + interoperability [IP:1]: + + "Be liberal in what you accept, and + conservative in what you send" + + Software should be written to deal with every conceivable + error, no matter how unlikely; sooner or later a packet will + come in with that particular combination of errors and + attributes, and unless the software is prepared, chaos can + ensue. In general, it is best to assume that the network is + filled with malevolent entities that will send in packets + designed to have the worst possible effect. This assumption + will lead to suitable protective design, although the most + serious problems in the Internet have been caused by + unenvisaged mechanisms triggered by low-probability events; + + + +Internet Engineering Task Force [Page 12] + + + + +RFC1122 INTRODUCTION October 1989 + + + mere human malice would never have taken so devious a course! + + Adaptability to change must be designed into all levels of + Internet host software. As a simple example, consider a + protocol specification that contains an enumeration of values + for a particular header field -- e.g., a type field, a port + number, or an error code; this enumeration must be assumed to + be incomplete. Thus, if a protocol specification defines four + possible error codes, the software must not break when a fifth + code shows up. An undefined code might be logged (see below), + but it must not cause a failure. + + The second part of the principle is almost as important: + software on other hosts may contain deficiencies that make it + unwise to exploit legal but obscure protocol features. It is + unwise to stray far from the obvious and simple, lest untoward + effects result elsewhere. 
A corollary of this is "watch out + for misbehaving hosts"; host software should be prepared, not + just to survive other misbehaving hosts, but also to cooperate + to limit the amount of disruption such hosts can cause to the + shared communication facility. + + 1.2.3 Error Logging + + The Internet includes a great variety of host and gateway + systems, each implementing many protocols and protocol layers, + and some of these contain bugs and mis-features in their + Internet protocol software. As a result of complexity, + diversity, and distribution of function, the diagnosis of + Internet problems is often very difficult. + + Problem diagnosis will be aided if host implementations include + a carefully designed facility for logging erroneous or + "strange" protocol events. It is important to include as much + diagnostic information as possible when an error is logged. In + particular, it is often useful to record the header(s) of a + packet that caused an error. However, care must be taken to + ensure that error logging does not consume prohibitive amounts + of resources or otherwise interfere with the operation of the + host. + + There is a tendency for abnormal but harmless protocol events + to overflow error logging files; this can be avoided by using a + "circular" log, or by enabling logging only while diagnosing a + known failure. It may be useful to filter and count duplicate + successive messages. One strategy that seems to work well is: + (1) always count abnormalities and make such counts accessible + through the management protocol (see [INTRO:1]); and (2) allow + + + +Internet Engineering Task Force [Page 13] + + + + +RFC1122 INTRODUCTION October 1989 + + + the logging of a great variety of events to be selectively + enabled. For example, it might be useful to be able to "log + everything" or to "log everything for host X".
+ + Note that different managements may have differing policies + about the amount of error logging that they want normally + enabled in a host. Some will say, "if it doesn't hurt me, I + don't want to know about it", while others will want to take a + more watchful and aggressive attitude about detecting and + removing protocol abnormalities. + + 1.2.4 Configuration + + It would be ideal if a host implementation of the Internet + protocol suite could be entirely self-configuring. This would + allow the whole suite to be implemented in ROM or cast into + silicon, it would simplify diskless workstations, and it would + be an immense boon to harried LAN administrators as well as + system vendors. We have not reached this ideal; in fact, we + are not even close. + + At many points in this document, you will find a requirement + that a parameter be a configurable option. There are several + different reasons behind such requirements. In a few cases, + there is current uncertainty or disagreement about the best + value, and it may be necessary to update the recommended value + in the future. In other cases, the value really depends on + external factors -- e.g., the size of the host and the + distribution of its communication load, or the speeds and + topology of nearby networks -- and self-tuning algorithms are + unavailable and may be insufficient. In some cases, + configurability is needed because of administrative + requirements. + + Finally, some configuration options are required to communicate + with obsolete or incorrect implementations of the protocols, + distributed without sources, that unfortunately persist in many + parts of the Internet. To make correct systems coexist with + these faulty systems, administrators often have to "mis- + configure" the correct systems. This problem will correct + itself gradually as the faulty systems are retired, but it + cannot be ignored by vendors. 
+ + When we say that a parameter must be configurable, we do not + intend to require that its value be explicitly read from a + configuration file at every boot time. We recommend that + implementors set up a default for each parameter, so a + configuration file is only necessary to override those defaults + + + +Internet Engineering Task Force [Page 14] + + + + +RFC1122 INTRODUCTION October 1989 + + + that are inappropriate in a particular installation. Thus, the + configurability requirement is an assurance that it will be + POSSIBLE to override the default when necessary, even in a + binary-only or ROM-based product. + + This document requires a particular value for such defaults in + some cases. The choice of default is a sensitive issue when + the configuration item controls the accommodation to existing + faulty systems. If the Internet is to converge successfully to + complete interoperability, the default values built into + implementations must implement the official protocol, not + "mis-configurations" to accommodate faulty implementations. + Although marketing considerations have led some vendors to + choose mis-configuration defaults, we urge vendors to choose + defaults that will conform to the standard. + + Finally, we note that a vendor needs to provide adequate + documentation on all configuration parameters, their limits and + effects. + + + 1.3 Reading this Document + + 1.3.1 Organization + + Protocol layering, which is generally used as an organizing + principle in implementing network software, has also been used + to organize this document. In describing the rules, we assume + that an implementation does strictly mirror the layering of the + protocols. Thus, the following three major sections specify + the requirements for the link layer, the internet layer, and + the transport layer, respectively. A companion RFC [INTRO:1] + covers application level software. This layerist organization + was chosen for simplicity and clarity. 
+ + However, strict layering is an imperfect model, both for the + protocol suite and for recommended implementation approaches. + Protocols in different layers interact in complex and sometimes + subtle ways, and particular functions often involve multiple + layers. There are many design choices in an implementation, + many of which involve creative "breaking" of strict layering. + Every implementor is urged to read references [INTRO:7] and + [INTRO:8]. + + This document describes the conceptual service interface + between layers using a functional ("procedure call") notation, + like that used in the TCP specification [TCP:1]. A host + implementation must support the logical information flow + + + +Internet Engineering Task Force [Page 15] + + + + +RFC1122 INTRODUCTION October 1989 + + + implied by these calls, but need not literally implement the + calls themselves. For example, many implementations reflect + the coupling between the transport layer and the IP layer by + giving them shared access to common data structures. These + data structures, rather than explicit procedure calls, are then + the agency for passing much of the information that is + required. + + In general, each major section of this document is organized + into the following subsections: + + (1) Introduction + + (2) Protocol Walk-Through -- considers the protocol + specification documents section-by-section, correcting + errors, stating requirements that may be ambiguous or + ill-defined, and providing further clarification or + explanation. + + (3) Specific Issues -- discusses protocol design and + implementation issues that were not included in the walk- + through. + + (4) Interfaces -- discusses the service interface to the next + higher layer. + + (5) Summary -- contains a summary of the requirements of the + section. + + + Under many of the individual topics in this document, there is + parenthetical material labeled "DISCUSSION" or + "IMPLEMENTATION". 
This material is intended to give + clarification and explanation of the preceding requirements + text. It also includes some suggestions on possible future + directions or developments. The implementation material + contains suggested approaches that an implementor may want to + consider. + + The summary sections are intended to be guides and indexes to + the text, but are necessarily cryptic and incomplete. The + summaries should never be used or referenced separately from + the complete RFC. + + 1.3.2 Requirements + + In this document, the words that are used to define the + significance of each particular requirement are capitalized. + + + +Internet Engineering Task Force [Page 16] + + + + +RFC1122 INTRODUCTION October 1989 + + + These words are: + + * "MUST" + + This word or the adjective "REQUIRED" means that the item + is an absolute requirement of the specification. + + * "SHOULD" + + This word or the adjective "RECOMMENDED" means that there + may exist valid reasons in particular circumstances to + ignore this item, but the full implications should be + understood and the case carefully weighed before choosing + a different course. + + * "MAY" + + This word or the adjective "OPTIONAL" means that this item + is truly optional. One vendor may choose to include the + item because a particular marketplace requires it or + because it enhances the product, for example; another + vendor may omit the same item. + + + An implementation is not compliant if it fails to satisfy one + or more of the MUST requirements for the protocols it + implements. An implementation that satisfies all the MUST and + all the SHOULD requirements for its protocols is said to be + "unconditionally compliant"; one that satisfies all the MUST + requirements but not all the SHOULD requirements for its + protocols is said to be "conditionally compliant". 
+ + 1.3.3 Terminology + + This document uses the following technical terms: + + Segment + A segment is the unit of end-to-end transmission in the + TCP protocol. A segment consists of a TCP header followed + by application data. A segment is transmitted by + encapsulation inside an IP datagram. + + Message + In this description of the lower-layer protocols, a + message is the unit of transmission in a transport layer + protocol. In particular, a TCP segment is a message. A + message consists of a transport protocol header followed + by application protocol data. To be transmitted end-to- + + + +Internet Engineering Task Force [Page 17] + + + + +RFC1122 INTRODUCTION October 1989 + + + end through the Internet, a message must be encapsulated + inside a datagram. + + IP Datagram + An IP datagram is the unit of end-to-end transmission in + the IP protocol. An IP datagram consists of an IP header + followed by transport layer data, i.e., of an IP header + followed by a message. + + In the description of the internet layer (Section 3), the + unqualified term "datagram" should be understood to refer + to an IP datagram. + + Packet + A packet is the unit of data passed across the interface + between the internet layer and the link layer. It + includes an IP header and data. A packet may be a + complete IP datagram or a fragment of an IP datagram. + + Frame + A frame is the unit of transmission in a link layer + protocol, and consists of a link-layer header followed by + a packet. + + Connected Network + A network to which a host is interfaced is often known as + the "local network" or the "subnetwork" relative to that + host. However, these terms can cause confusion, and + therefore we use the term "connected network" in this + document. + + Multihomed + A host is said to be multihomed if it has multiple IP + addresses. For a discussion of multihoming, see Section + 3.3.4 below. 
+ + Physical network interface + This is a physical interface to a connected network and + has a (possibly unique) link-layer address. Multiple + physical network interfaces on a single host may share the + same link-layer address, but the address must be unique + for different hosts on the same physical network. + + Logical [network] interface + We define a logical [network] interface to be a logical + path, distinguished by a unique IP address, to a connected + network. See Section 3.3.4. + + + + +Internet Engineering Task Force [Page 18] + + + + +RFC1122 INTRODUCTION October 1989 + + + Specific-destination address + This is the effective destination address of a datagram, + even if it is broadcast or multicast; see Section 3.2.1.3. + + Path + At a given moment, all the IP datagrams from a particular + source host to a particular destination host will + typically traverse the same sequence of gateways. We use + the term "path" for this sequence. Note that a path is + uni-directional; it is not unusual to have different paths + in the two directions between a given host pair. + + MTU + The maximum transmission unit, i.e., the size of the + largest packet that can be transmitted. + + + The terms frame, packet, datagram, message, and segment are + illustrated by the following schematic diagrams: + + A. Transmission on connected network: + _______________________________________________ + | LL hdr | IP hdr | (data) | + |________|________|_____________________________| + + <---------- Frame -----------------------------> + <----------Packet --------------------> + + + B. 
Before IP fragmentation or after IP reassembly: + ______________________________________ + | IP hdr | transport| Application Data | + |________|____hdr___|__________________| + + <-------- Datagram ------------------> + <-------- Message -----------> + or, for TCP: + ______________________________________ + | IP hdr | TCP hdr | Application Data | + |________|__________|__________________| + + <-------- Datagram ------------------> + <-------- Segment -----------> + + + + + + + + +Internet Engineering Task Force [Page 19] + + + + +RFC1122 INTRODUCTION October 1989 + + + 1.4 Acknowledgments + + This document incorporates contributions and comments from a large + group of Internet protocol experts, including representatives of + university and research labs, vendors, and government agencies. + It was assembled primarily by the Host Requirements Working Group + of the Internet Engineering Task Force (IETF). + + The Editor would especially like to acknowledge the tireless + dedication of the following people, who attended many long + meetings and generated 3 million bytes of electronic mail over the + past 18 months in pursuit of this document: Philip Almquist, Dave + Borman (Cray Research), Noel Chiappa, Dave Crocker (DEC), Steve + Deering (Stanford), Mike Karels (Berkeley), Phil Karn (Bellcore), + John Lekashman (NASA), Charles Lynn (BBN), Keith McCloghrie (TWG), + Paul Mockapetris (ISI), Thomas Narten (Purdue), Craig Partridge + (BBN), Drew Perkins (CMU), and James Van Bokkelen (FTP Software). 
+ + In addition, the following people made major contributions to the + effort: Bill Barns (Mitre), Steve Bellovin (AT&T), Mike Brescia + (BBN), Ed Cain (DCA), Annette DeSchon (ISI), Martin Gross (DCA), + Phill Gross (NRI), Charles Hedrick (Rutgers), Van Jacobson (LBL), + John Klensin (MIT), Mark Lottor (SRI), Milo Medin (NASA), Bill + Melohn (Sun Microsystems), Greg Minshall (Kinetics), Jeff Mogul + (DEC), John Mullen (CMC), Jon Postel (ISI), John Romkey (Epilogue + Technology), and Mike StJohns (DCA). The following also made + significant contributions to particular areas: Eric Allman + (Berkeley), Rob Austein (MIT), Art Berggreen (ACC), Keith Bostic + (Berkeley), Vint Cerf (NRI), Wayne Hathaway (NASA), Matt Korn + (IBM), Erik Naggum (Naggum Software, Norway), Robert Ullmann + (Prime Computer), David Waitzman (BBN), Frank Wancho (USA), Arun + Welch (Ohio State), Bill Westfield (Cisco), and Rayan Zachariassen + (Toronto). + + We are grateful to all, including any contributors who may have + been inadvertently omitted from this list. + + + + + + + + + + + + + + + +Internet Engineering Task Force [Page 20] + + + + +RFC1122 LINK LAYER October 1989 + + +2. LINK LAYER + + 2.1 INTRODUCTION + + All Internet systems, both hosts and gateways, have the same + requirements for link layer protocols. These requirements are + given in Chapter 3 of "Requirements for Internet Gateways" + [INTRO:2], augmented with the material in this section. + + 2.2 PROTOCOL WALK-THROUGH + + None. + + 2.3 SPECIFIC ISSUES + + 2.3.1 Trailer Protocol Negotiation + + The trailer protocol [LINK:1] for link-layer encapsulation MAY + be used, but only when it has been verified that both systems + (host or gateway) involved in the link-layer communication + implement trailers. If the system does not dynamically + negotiate use of the trailer protocol on a per-destination + basis, the default configuration MUST disable the protocol. 
+ + DISCUSSION: + The trailer protocol is a link-layer encapsulation + technique that rearranges the data contents of packets + sent on the physical network. In some cases, trailers + improve the throughput of higher layer protocols by + reducing the amount of data copying within the operating + system. Higher layer protocols are unaware of trailer + use, but both the sending and receiving host MUST + understand the protocol if it is used. + + Improper use of trailers can result in very confusing + symptoms. Only packets with specific size attributes are + encapsulated using trailers, and typically only a small + fraction of the packets being exchanged have these + attributes. Thus, if a system using trailers exchanges + packets with a system that does not, some packets + disappear into a black hole while others are delivered + successfully. + + IMPLEMENTATION: + On an Ethernet, packets encapsulated with trailers use a + distinct Ethernet type [LINK:1], and trailer negotiation + is performed at the time that ARP is used to discover the + link-layer address of a destination system. + + + +Internet Engineering Task Force [Page 21] + + + + +RFC1122 LINK LAYER October 1989 + + + Specifically, the ARP exchange is completed in the usual + manner using the normal IP protocol type, but a host that + wants to speak trailers will send an additional "trailer + ARP reply" packet, i.e., an ARP reply that specifies the + trailer encapsulation protocol type but otherwise has the + format of a normal ARP reply. If a host configured to use + trailers receives a trailer ARP reply message from a + remote machine, it can add that machine to the list of + machines that understand trailers, e.g., by marking the + corresponding entry in the ARP cache. + + Hosts wishing to receive trailer encapsulations send + trailer ARP replies whenever they complete exchanges of + normal ARP messages for IP. 
Thus, a host that received an + ARP request for its IP protocol address would send a + trailer ARP reply in addition to the normal IP ARP reply; + a host that sent the IP ARP request would send a trailer + ARP reply when it received the corresponding IP ARP reply. + In this way, either the requesting or responding host in + an IP ARP exchange may request that it receive trailer + encapsulations. + + This scheme, using extra trailer ARP reply packets rather + than sending an ARP request for the trailer protocol type, + was designed to avoid a continuous exchange of ARP packets + with a misbehaving host that, contrary to any + specification or common sense, responded to an ARP reply + for trailers with another ARP reply for IP. This problem + is avoided by sending a trailer ARP reply in response to + an IP ARP reply only when the IP ARP reply answers an + outstanding request; this is true when the hardware + address for the host is still unknown when the IP ARP + reply is received. A trailer ARP reply may always be sent + along with an IP ARP reply responding to an IP ARP + request. + + 2.3.2 Address Resolution Protocol -- ARP + + 2.3.2.1 ARP Cache Validation + + An implementation of the Address Resolution Protocol (ARP) + [LINK:2] MUST provide a mechanism to flush out-of-date cache + entries. If this mechanism involves a timeout, it SHOULD be + possible to configure the timeout value. + + A mechanism to prevent ARP flooding (repeatedly sending an + ARP Request for the same IP address, at a high rate) MUST be + included. The recommended maximum rate is 1 per second per + + + +Internet Engineering Task Force [Page 22] + + + + +RFC1122 LINK LAYER October 1989 + + + destination. + + DISCUSSION: + The ARP specification [LINK:2] suggests but does not + require a timeout mechanism to invalidate cache entries + when hosts change their Ethernet addresses. 
The + prevalence of proxy ARP (see Section 2.4 of [INTRO:2]) + has significantly increased the likelihood that cache + entries in hosts will become invalid, and therefore + some ARP-cache invalidation mechanism is now required + for hosts. Even in the absence of proxy ARP, a long- + period cache timeout is useful in order to + automatically correct any bad ARP data that might have + been cached. + + IMPLEMENTATION: + Four mechanisms have been used, sometimes in + combination, to flush out-of-date cache entries. + + (1) Timeout -- Periodically time out cache entries, + even if they are in use. Note that this timeout + should be restarted when the cache entry is + "refreshed" (by observing the source fields, + regardless of target address, of an ARP broadcast + from the system in question). For proxy ARP + situations, the timeout needs to be on the order + of a minute. + + (2) Unicast Poll -- Actively poll the remote host by + periodically sending a point-to-point ARP Request + to it, and delete the entry if no ARP Reply is + received from N successive polls. Again, the + timeout should be on the order of a minute, and + typically N is 2. + + (3) Link-Layer Advice -- If the link-layer driver + detects a delivery problem, flush the + corresponding ARP cache entry. + + (4) Higher-layer Advice -- Provide a call from the + Internet layer to the link layer to indicate a + delivery problem. The effect of this call would + be to invalidate the corresponding cache entry. + This call would be analogous to the + "ADVISE_DELIVPROB()" call from the transport layer + to the Internet layer (see Section 3.4), and in + fact the ADVISE_DELIVPROB routine might in turn + call the link-layer advice routine to invalidate + + + +Internet Engineering Task Force [Page 23] + + + + +RFC1122 LINK LAYER October 1989 + + + the ARP cache entry. + + Approaches (1) and (2) involve ARP cache timeouts on + the order of a minute or less. 
In the absence of proxy + ARP, a timeout this short could create noticeable + overhead traffic on a very large Ethernet. Therefore, + it may be necessary to configure a host to lengthen the + ARP cache timeout. + + 2.3.2.2 ARP Packet Queue + + The link layer SHOULD save (rather than discard) at least + one (the latest) packet of each set of packets destined to + the same unresolved IP address, and transmit the saved + packet when the address has been resolved. + + DISCUSSION: + Failure to follow this recommendation causes the first + packet of every exchange to be lost. Although higher- + layer protocols can generally cope with packet loss by + retransmission, packet loss does impact performance. + For example, loss of a TCP open request causes the + initial round-trip time estimate to be inflated. UDP- + based applications such as the Domain Name System are + more seriously affected. + + 2.3.3 Ethernet and IEEE 802 Encapsulation + + The IP encapsulation for Ethernets is described in RFC-894 + [LINK:3], while RFC-1042 [LINK:4] describes the IP + encapsulation for IEEE 802 networks. RFC-1042 elaborates and + replaces the discussion in Section 3.4 of [INTRO:2]. + + Every Internet host connected to a 10Mbps Ethernet cable: + + o MUST be able to send and receive packets using RFC-894 + encapsulation; + + o SHOULD be able to receive RFC-1042 packets, intermixed + with RFC-894 packets; and + + o MAY be able to send packets using RFC-1042 encapsulation. + + + An Internet host that implements sending both the RFC-894 and + the RFC-1042 encapsulations MUST provide a configuration switch + to select which is sent, and this switch MUST default to RFC- + 894. 
+ + + +Internet Engineering Task Force [Page 24] + + + + +RFC1122 LINK LAYER October 1989 + + + Note that the standard IP encapsulation in RFC-1042 does not + use the protocol id value (K1=6) that IEEE reserved for IP; + instead, it uses a value (K1=170) that implies an extension + (the "SNAP") which can be used to hold the Ether-Type field. + An Internet system MUST NOT send 802 packets using K1=6. + + Address translation from Internet addresses to link-layer + addresses on Ethernet and IEEE 802 networks MUST be managed by + the Address Resolution Protocol (ARP). + + The MTU for an Ethernet is 1500 and for 802.3 is 1492. + + DISCUSSION: + The IEEE 802.3 specification provides for operation over a + 10Mbps Ethernet cable, in which case Ethernet and IEEE + 802.3 frames can be physically intermixed. A receiver can + distinguish Ethernet and 802.3 frames by the value of the + 802.3 Length field; this two-octet field coincides in the + header with the Ether-Type field of an Ethernet frame. In + particular, the 802.3 Length field must be less than or + equal to 1500, while all valid Ether-Type values are + greater than 1500. + + Another compatibility problem arises with link-layer + broadcasts. A broadcast sent with one framing will not be + seen by hosts that can receive only the other framing. + + The provisions of this section were designed to provide + direct interoperation between 894-capable and 1042-capable + systems on the same cable, to the maximum extent possible. + It is intended to support the present situation where + 894-only systems predominate, while providing an easy + transition to a possible future in which 1042-capable + systems become common. + + Note that 894-only systems cannot interoperate directly + with 1042-only systems. If the two system types are set + up as two different logical networks on the same cable, + they can communicate only through an IP gateway. 
+ Furthermore, it is not useful or even possible for a + dual-format host to discover automatically which format to + send, because of the problem of link-layer broadcasts. + + 2.4 LINK/INTERNET LAYER INTERFACE + + The packet receive interface between the IP layer and the link + layer MUST include a flag to indicate whether the incoming packet + was addressed to a link-layer broadcast address. + + + +Internet Engineering Task Force [Page 25] + + + + +RFC1122 LINK LAYER October 1989 + + + DISCUSSION + Although the IP layer does not generally know link layer + addresses (since every different network medium typically has + a different address format), the broadcast address on a + broadcast-capable medium is an important special case. See + Section 3.2.2, especially the DISCUSSION concerning broadcast + storms. + + The packet send interface between the IP and link layers MUST + include the 5-bit TOS field (see Section 3.2.1.6). + + The link layer MUST NOT report a Destination Unreachable error to + IP solely because there is no ARP cache entry for a destination. + + 2.5 LINK LAYER REQUIREMENTS SUMMARY + + | | | | |S| | + | | | | |H| |F + | | | | |O|M|o + | | |S| |U|U|o + | | |H| |L|S|t + | |M|O| |D|T|n + | |U|U|M| | |o + | |S|L|A|N|N|t + | |T|D|Y|O|O|t +FEATURE |SECTION| | | |T|T|e +--------------------------------------------------|-------|-|-|-|-|-|-- + | | | | | | | +Trailer encapsulation |2.3.1 | | |x| | | +Send Trailers by default without negotiation |2.3.1 | | | | |x| +ARP |2.3.2 | | | | | | + Flush out-of-date ARP cache entries |2.3.2.1|x| | | | | + Prevent ARP floods |2.3.2.1|x| | | | | + Cache timeout configurable |2.3.2.1| |x| | | | + Save at least one (latest) unresolved pkt |2.3.2.2| |x| | | | +Ethernet and IEEE 802 Encapsulation |2.3.3 | | | | | | + Host able to: |2.3.3 | | | | | | + Send & receive RFC-894 encapsulation |2.3.3 |x| | | | | + Receive RFC-1042 encapsulation |2.3.3 | |x| | | | + Send RFC-1042 encapsulation |2.3.3 | | |x| | | + Then config. 
sw. to select, RFC-894 dflt |2.3.3 |x| | | | | + Send K1=6 encapsulation |2.3.3 | | | | |x| + Use ARP on Ethernet and IEEE 802 nets |2.3.3 |x| | | | | +Link layer report b'casts to IP layer |2.4 |x| | | | | +IP layer pass TOS to link layer |2.4 |x| | | | | +No ARP cache entry treated as Dest. Unreach. |2.4 | | | | |x| + + + + + +Internet Engineering Task Force [Page 26] + + + + +RFC1122 INTERNET LAYER October 1989 + + +3. INTERNET LAYER PROTOCOLS + + 3.1 INTRODUCTION + + The Robustness Principle: "Be liberal in what you accept, and + conservative in what you send" is particularly important in the + Internet layer, where one misbehaving host can deny Internet + service to many other hosts. + + The protocol standards used in the Internet layer are: + + o RFC-791 [IP:1] defines the IP protocol and gives an + introduction to the architecture of the Internet. + + o RFC-792 [IP:2] defines ICMP, which provides routing, + diagnostic and error functionality for IP. Although ICMP + messages are encapsulated within IP datagrams, ICMP + processing is considered to be (and is typically implemented + as) part of the IP layer. See Section 3.2.2. + + o RFC-950 [IP:3] defines the mandatory subnet extension to the + addressing architecture. + + o RFC-1112 [IP:4] defines the Internet Group Management + Protocol IGMP, as part of a recommended extension to hosts + and to the host-gateway interface to support Internet-wide + multicasting at the IP level. See Section 3.2.3. + + The target of an IP multicast may be an arbitrary group of + Internet hosts. IP multicasting is designed as a natural + extension of the link-layer multicasting facilities of some + networks, and it provides a standard means for local access + to such link-layer multicasting facilities. + + Other important references are listed in Section 5 of this + document. + + The Internet layer of host software MUST implement both IP and + ICMP. See Section 3.3.7 for the requirements on support of IGMP. 
+ + The host IP layer has two basic functions: (1) choose the "next + hop" gateway or host for outgoing IP datagrams and (2) reassemble + incoming IP datagrams. The IP layer may also (3) implement + intentional fragmentation of outgoing datagrams. Finally, the IP + layer must (4) provide diagnostic and error functionality. We + expect that IP layer functions may increase somewhat in the + future, as further Internet control and management facilities are + developed. + + + +Internet Engineering Task Force [Page 27] + + + + +RFC1122 INTERNET LAYER October 1989 + + + For normal datagrams, the processing is straightforward. For + incoming datagrams, the IP layer: + + (1) verifies that the datagram is correctly formatted; + + (2) verifies that it is destined to the local host; + + (3) processes options; + + (4) reassembles the datagram if necessary; and + + (5) passes the encapsulated message to the appropriate + transport-layer protocol module. + + For outgoing datagrams, the IP layer: + + (1) sets any fields not set by the transport layer; + + (2) selects the correct first hop on the connected network (a + process called "routing"); + + (3) fragments the datagram if necessary and if intentional + fragmentation is implemented (see Section 3.3.3); and + + (4) passes the packet(s) to the appropriate link-layer driver. + + + A host is said to be multihomed if it has multiple IP addresses. + Multihoming introduces considerable confusion and complexity into + the protocol suite, and it is an area in which the Internet + architecture falls seriously short of solving all problems. There + are two distinct problem areas in multihoming: + + (1) Local multihoming -- the host itself is multihomed; or + + (2) Remote multihoming -- the local host needs to communicate + with a remote multihomed host. + + At present, remote multihoming MUST be handled at the application + layer, as discussed in the companion RFC [INTRO:1]. 
A host MAY + support local multihoming, which is discussed in this document, + and in particular in Section 3.3.4. + + Any host that forwards datagrams generated by another host is + acting as a gateway and MUST also meet the specifications laid out + in the gateway requirements RFC [INTRO:2]. An Internet host that + includes embedded gateway code MUST have a configuration switch to + disable the gateway function, and this switch MUST default to the + + + +Internet Engineering Task Force [Page 28] + + + + +RFC1122 INTERNET LAYER October 1989 + + + non-gateway mode. In this mode, a datagram arriving through one + interface will not be forwarded to another host or gateway (unless + it is source-routed), regardless of whether the host is single- + homed or multihomed. The host software MUST NOT automatically + move into gateway mode if the host has more than one interface, as + the operator of the machine may neither want to provide that + service nor be competent to do so. + + In the following, the action specified in certain cases is to + "silently discard" a received datagram. This means that the + datagram will be discarded without further processing and that the + host will not send any ICMP error message (see Section 3.2.2) as a + result. However, for diagnosis of problems a host SHOULD provide + the capability of logging the error (see Section 1.2.3), including + the contents of the silently-discarded datagram, and SHOULD record + the event in a statistics counter. + + DISCUSSION: + Silent discard of erroneous datagrams is generally intended + to prevent "broadcast storms". + + 3.2 PROTOCOL WALK-THROUGH + + 3.2.1 Internet Protocol -- IP + + 3.2.1.1 Version Number: RFC-791 Section 3.1 + + A datagram whose version number is not 4 MUST be silently + discarded. + + 3.2.1.2 Checksum: RFC-791 Section 3.1 + + A host MUST verify the IP header checksum on every received + datagram and silently discard every datagram that has a bad + checksum. 
+ + 3.2.1.3 Addressing: RFC-791 Section 3.2 + + There are now five classes of IP addresses: Class A through + Class E. Class D addresses are used for IP multicasting + [IP:4], while Class E addresses are reserved for + experimental use. + + A multicast (Class D) address is a 28-bit logical address + that stands for a group of hosts, and may be either + permanent or transient. Permanent multicast addresses are + allocated by the Internet Assigned Number Authority + [INTRO:6], while transient addresses may be allocated + + + +Internet Engineering Task Force [Page 29] + + + + +RFC1122 INTERNET LAYER October 1989 + + + dynamically to transient groups. Group membership is + determined dynamically using IGMP [IP:4]. + + We now summarize the important special cases for Class A, B, + and C IP addresses, using the following notation for an IP + address: + + { <Network-number>, <Host-number> } + + or + { <Network-number>, <Subnet-number>, <Host-number> } + + and the notation "-1" for a field that contains all 1 bits. + This notation is not intended to imply that the 1-bits in an + address mask need be contiguous. + + (a) { 0, 0 } + + This host on this network. MUST NOT be sent, except as + a source address as part of an initialization procedure + by which the host learns its own IP address. + + See also Section 3.3.6 for a non-standard use of {0,0}. + + (b) { 0, <Host-number> } + + Specified host on this network. It MUST NOT be sent, + except as a source address as part of an initialization + procedure by which the host learns its full IP address. + + (c) { -1, -1 } + + Limited broadcast. It MUST NOT be used as a source + address. + + A datagram with this destination address will be + received by every host on the connected physical + network but will not be forwarded outside that network. + + (d) { <Network-number>, -1 } + + Directed broadcast to the specified network. It MUST + NOT be used as a source address. + + (e) { <Network-number>, <Subnet-number>, -1 } + + Directed broadcast to the specified subnet. It MUST + NOT be used as a source address. 
+ + + +Internet Engineering Task Force [Page 30] + + + + +RFC1122 INTERNET LAYER October 1989 + + + (f) { <Network-number>, -1, -1 } + + Directed broadcast to all subnets of the specified + subnetted network. It MUST NOT be used as a source + address. + + (g) { 127, <any> } + + Internal host loopback address. Addresses of this form + MUST NOT appear outside a host. + + The <Network-number> is administratively assigned so that + its value will be unique in the entire world. + + IP addresses are not permitted to have the value 0 or -1 for + any of the <Host-number>, <Network-number>, or <Subnet-number> fields (except in the special cases listed above). + This implies that each of these fields will be at least two + bits long. + + For further discussion of broadcast addresses, see Section + 3.3.6. + + A host MUST support the subnet extensions to IP [IP:3]. As + a result, there will be an address mask of the form: + {-1, -1, 0} associated with each of the host's local IP + addresses; see Sections 3.2.2.9 and 3.3.1.1. + + When a host sends any datagram, the IP source address MUST + be one of its own IP addresses (but not a broadcast or + multicast address). + + A host MUST silently discard an incoming datagram that is + not destined for the host. An incoming datagram is destined + for the host if the datagram's destination address field is: + + (1) (one of) the host's IP address(es); or + + (2) an IP broadcast address valid for the connected + network; or + + (3) the address for a multicast group of which the host is + a member on the incoming physical interface. + + For most purposes, a datagram addressed to a broadcast or + multicast destination is processed as if it had been + addressed to one of the host's IP addresses; we use the term + "specific-destination address" for the equivalent local IP + + + +Internet Engineering Task Force [Page 31] + + + + +RFC1122 INTERNET LAYER October 1989 + + + address of the host. 
The specific-destination address is + defined to be the destination address in the IP header + unless the header contains a broadcast or multicast address, + in which case the specific-destination is an IP address + assigned to the physical interface on which the datagram + arrived. + + A host MUST silently discard an incoming datagram containing + an IP source address that is invalid by the rules of this + section. This validation could be done in either the IP + layer or by each protocol in the transport layer. + + DISCUSSION: + A mis-addressed datagram might be caused by a link- + layer broadcast of a unicast datagram or by a gateway + or host that is confused or mis-configured. + + An architectural goal for Internet hosts was to allow + IP addresses to be featureless 32-bit numbers, avoiding + algorithms that required a knowledge of the IP address + format. Otherwise, any future change in the format or + interpretation of IP addresses will require host + software changes. However, validation of broadcast and + multicast addresses violates this goal; a few other + violations are described elsewhere in this document. + + Implementers should be aware that applications + depending upon the all-subnets directed broadcast + address (f) may be unusable on some networks. All- + subnets broadcast is not widely implemented in vendor + gateways at present, and even when it is implemented, a + particular network administration may disable it in the + gateway configuration. + + 3.2.1.4 Fragmentation and Reassembly: RFC-791 Section 3.2 + + The Internet model requires that every host support + reassembly. See Sections 3.3.2 and 3.3.3 for the + requirements on fragmentation and reassembly. + + 3.2.1.5 Identification: RFC-791 Section 3.2 + + When sending an identical copy of an earlier datagram, a + host MAY optionally retain the same Identification field in + the copy. 
+ + + + + + +Internet Engineering Task Force [Page 32] + + + + +RFC1122 INTERNET LAYER October 1989 + + + DISCUSSION: + Some Internet protocol experts have maintained that + when a host sends an identical copy of an earlier + datagram, the new copy should contain the same + Identification value as the original. There are two + suggested advantages: (1) if the datagrams are + fragmented and some of the fragments are lost, the + receiver may be able to reconstruct a complete datagram + from fragments of the original and the copies; (2) a + congested gateway might use the IP Identification field + (and Fragment Offset) to discard duplicate datagrams + from the queue. + + However, the observed patterns of datagram loss in the + Internet do not favor the probability of retransmitted + fragments filling reassembly gaps, while other + mechanisms (e.g., TCP repacketizing upon + retransmission) tend to prevent retransmission of an + identical datagram [IP:9]. Therefore, we believe that + retransmitting the same Identification field is not + useful. Also, a connectionless transport protocol like + UDP would require the cooperation of the application + programs to retain the same Identification value in + identical datagrams. + + 3.2.1.6 Type-of-Service: RFC-791 Section 3.2 + + The "Type-of-Service" byte in the IP header is divided into + two sections: the Precedence field (high-order 3 bits), and + a field that is customarily called "Type-of-Service" or + "TOS" (low-order 5 bits). In this document, all references + to "TOS" or the "TOS field" refer to the low-order 5 bits + only. + + The Precedence field is intended for Department of Defense + applications of the Internet protocols. The use of non-zero + values in this field is outside the scope of this document + and the IP standard specification. Vendors should consult + the Defense Communication Agency (DCA) for guidance on the + IP Precedence field and its implications for other protocol + layers. 
However, vendors should note that the use of + precedence will most likely require that its value be passed + between protocol layers in just the same way as the TOS + field is passed. + + The IP layer MUST provide a means for the transport layer to + set the TOS field of every datagram that is sent; the + default is all zero bits. The IP layer SHOULD pass received + + + +Internet Engineering Task Force [Page 33] + + + + +RFC1122 INTERNET LAYER October 1989 + + + TOS values up to the transport layer. + + The particular link-layer mappings of TOS contained in RFC- + 795 SHOULD NOT be implemented. + + DISCUSSION: + While the TOS field has been little used in the past, + it is expected to play an increasing role in the near + future. The TOS field is expected to be used to + control two aspects of gateway operations: routing and + queueing algorithms. See Section 2 of [INTRO:1] for + the requirements on application programs to specify TOS + values. + + The TOS field may also be mapped into link-layer + service selectors. This has been applied to provide + effective sharing of serial lines by different classes + of TCP traffic, for example. However, the mappings + suggested in RFC-795 for networks that were included in + the Internet as of 1981 are now obsolete. + + 3.2.1.7 Time-to-Live: RFC-791 Section 3.2 + + A host MUST NOT send a datagram with a Time-to-Live (TTL) + value of zero. + + A host MUST NOT discard a datagram just because it was + received with TTL less than 2. + + The IP layer MUST provide a means for the transport layer to + set the TTL field of every datagram that is sent. When a + fixed TTL value is used, it MUST be configurable. The + current suggested value will be published in the "Assigned + Numbers" RFC. + + DISCUSSION: + The TTL field has two functions: limit the lifetime of + TCP segments (see RFC-793 [TCP:1], p. 28), and + terminate Internet routing loops. 
Although TTL is a + time in seconds, it also has some attributes of a hop- + count, since each gateway is required to reduce the TTL + field by at least one. + + The intent is that TTL expiration will cause a datagram + to be discarded by a gateway but not by the destination + host; however, hosts that act as gateways by forwarding + datagrams must follow the gateway rules for TTL. + + + + +Internet Engineering Task Force [Page 34] + + + + +RFC1122 INTERNET LAYER October 1989 + + + A higher-layer protocol may want to set the TTL in + order to implement an "expanding scope" search for some + Internet resource. This is used by some diagnostic + tools, and is expected to be useful for locating the + "nearest" server of a given class using IP + multicasting, for example. A particular transport + protocol may also want to specify its own TTL bound on + maximum datagram lifetime. + + A fixed value must be at least big enough for the + Internet "diameter," i.e., the longest possible path. + A reasonable value is about twice the diameter, to + allow for continued Internet growth. + + 3.2.1.8 Options: RFC-791 Section 3.2 + + There MUST be a means for the transport layer to specify IP + options to be included in transmitted IP datagrams (see + Section 3.4). + + All IP options (except NOP or END-OF-LIST) received in + datagrams MUST be passed to the transport layer (or to ICMP + processing when the datagram is an ICMP message). The IP + and transport layer MUST each interpret those IP options + that they understand and silently ignore the others. + + Later sections of this document discuss specific IP option + support required by each of ICMP, TCP, and UDP. + + DISCUSSION: + Passing all received IP options to the transport layer + is a deliberate "violation of strict layering" that is + designed to ease the introduction of new transport- + relevant IP options in the future. Each layer must + pick out any options that are relevant to its own + processing and ignore the rest. 
For this purpose, + every IP option except NOP and END-OF-LIST will include + a specification of its own length. + + This document does not define the order in which a + receiver must process multiple options in the same IP + header. Hosts sending multiple options must be aware + that this introduces an ambiguity in the meaning of + certain options when combined with a source-route + option. + + IMPLEMENTATION: + The IP layer must not crash as the result of an option + + + +Internet Engineering Task Force [Page 35] + + + + +RFC1122 INTERNET LAYER October 1989 + + + length that is outside the possible range. For + example, erroneous option lengths have been observed to + put some IP implementations into infinite loops. + + Here are the requirements for specific IP options: + + + (a) Security Option + + Some environments require the Security option in every + datagram; such a requirement is outside the scope of + this document and the IP standard specification. Note, + however, that the security options described in RFC-791 + and RFC-1038 are obsolete. For DoD applications, + vendors should consult [IP:8] for guidance. + + + (b) Stream Identifier Option + + This option is obsolete; it SHOULD NOT be sent, and it + MUST be silently ignored if received. + + + (c) Source Route Options + + A host MUST support originating a source route and MUST + be able to act as the final destination of a source + route. + + If host receives a datagram containing a completed + source route (i.e., the pointer points beyond the last + field), the datagram has reached its final destination; + the option as received (the recorded route) MUST be + passed up to the transport layer (or to ICMP message + processing). This recorded route will be reversed and + used to form a return source route for reply datagrams + (see discussion of IP Options in Section 4). 
When a + return source route is built, it MUST be correctly + formed even if the recorded route included the source + host (see case (B) in the discussion below). + + An IP header containing more than one Source Route + option MUST NOT be sent; the effect on routing of + multiple Source Route options is implementation- + specific. + + Section 3.3.5 presents the rules for a host acting as + an intermediate hop in a source route, i.e., forwarding + + + +Internet Engineering Task Force [Page 36] + + + + +RFC1122 INTERNET LAYER October 1989 + + + a source-routed datagram. + + DISCUSSION: + If a source-routed datagram is fragmented, each + fragment will contain a copy of the source route. + Since the processing of IP options (including a + source route) must precede reassembly, the + original datagram will not be reassembled until + the final destination is reached. + + Suppose a source routed datagram is to be routed + from host S to host D via gateways G1, G2, ... Gn. + There was an ambiguity in the specification over + whether the source route option in a datagram sent + out by S should be (A) or (B): + + (A): {>>G2, G3, ... Gn, D} <--- CORRECT + + (B): {S, >>G2, G3, ... Gn, D} <---- WRONG + + (where >> represents the pointer). If (A) is + sent, the datagram received at D will contain the + option: {G1, G2, ... Gn >>}, with S and D as the + IP source and destination addresses. If (B) were + sent, the datagram received at D would again + contain S and D as the same IP source and + destination addresses, but the option would be: + {S, G1, ...Gn >>}; i.e., the originating host + would be the first hop in the route. + + + (d) Record Route Option + + Implementation of originating and processing the Record + Route option is OPTIONAL. + + + (e) Timestamp Option + + Implementation of originating and processing the + Timestamp option is OPTIONAL. 
If it is implemented, + the following rules apply: + + o The originating host MUST record a timestamp in a + Timestamp option whose Internet address fields are + not pre-specified or whose first pre-specified + address is the host's interface address. + + + + +Internet Engineering Task Force [Page 37] + + + + +RFC1122 INTERNET LAYER October 1989 + + + o The destination host MUST (if possible) add the + current timestamp to a Timestamp option before + passing the option to the transport layer or to + ICMP for processing. + + o A timestamp value MUST follow the rules given in + Section 3.2.2.8 for the ICMP Timestamp message. + + + 3.2.2 Internet Control Message Protocol -- ICMP + + ICMP messages are grouped into two classes. + + * + ICMP error messages: + + Destination Unreachable (see Section 3.2.2.1) + Redirect (see Section 3.2.2.2) + Source Quench (see Section 3.2.2.3) + Time Exceeded (see Section 3.2.2.4) + Parameter Problem (see Section 3.2.2.5) + + + * + ICMP query messages: + + Echo (see Section 3.2.2.6) + Information (see Section 3.2.2.7) + Timestamp (see Section 3.2.2.8) + Address Mask (see Section 3.2.2.9) + + + If an ICMP message of unknown type is received, it MUST be + silently discarded. + + Every ICMP error message includes the Internet header and at + least the first 8 data octets of the datagram that triggered + the error; more than 8 octets MAY be sent; this header and data + MUST be unchanged from the received datagram. + + In those cases where the Internet layer is required to pass an + ICMP error message to the transport layer, the IP protocol + number MUST be extracted from the original header and used to + select the appropriate transport protocol entity to handle the + error. + + An ICMP error message SHOULD be sent with normal (i.e., zero) + TOS bits. 
+ + + +Internet Engineering Task Force [Page 38] + + + + +RFC1122 INTERNET LAYER October 1989 + + + An ICMP error message MUST NOT be sent as the result of + receiving: + + * an ICMP error message, or + + * a datagram destined to an IP broadcast or IP multicast + address, or + + * a datagram sent as a link-layer broadcast, or + + * a non-initial fragment, or + + * a datagram whose source address does not define a single + host -- e.g., a zero address, a loopback address, a + broadcast address, a multicast address, or a Class E + address. + + NOTE: THESE RESTRICTIONS TAKE PRECEDENCE OVER ANY REQUIREMENT + ELSEWHERE IN THIS DOCUMENT FOR SENDING ICMP ERROR MESSAGES. + + DISCUSSION: + These rules will prevent the "broadcast storms" that have + resulted from hosts returning ICMP error messages in + response to broadcast datagrams. For example, a broadcast + UDP segment to a non-existent port could trigger a flood + of ICMP Destination Unreachable datagrams from all + machines that do not have a client for that destination + port. On a large Ethernet, the resulting collisions can + render the network useless for a second or more. + + Every datagram that is broadcast on the connected network + should have a valid IP broadcast address as its IP + destination (see Section 3.3.6). However, some hosts + violate this rule. To be certain to detect broadcast + datagrams, therefore, hosts are required to check for a + link-layer broadcast as well as an IP-layer broadcast + address. + + IMPLEMENTATION: + This requires that the link layer inform the IP layer when + a link-layer broadcast datagram has been received; see + Section 2.4. 
+ + 3.2.2.1 Destination Unreachable: RFC-792 + + The following additional codes are hereby defined: + + 6 = destination network unknown + + + +Internet Engineering Task Force [Page 39] + + + + +RFC1122 INTERNET LAYER October 1989 + + + 7 = destination host unknown + + 8 = source host isolated + + 9 = communication with destination network + administratively prohibited + + 10 = communication with destination host + administratively prohibited + + 11 = network unreachable for type of service + + 12 = host unreachable for type of service + + A host SHOULD generate Destination Unreachable messages with + code: + + 2 (Protocol Unreachable), when the designated transport + protocol is not supported; or + + 3 (Port Unreachable), when the designated transport + protocol (e.g., UDP) is unable to demultiplex the + datagram but has no protocol mechanism to inform the + sender. + + A Destination Unreachable message that is received MUST be + reported to the transport layer. The transport layer SHOULD + use the information appropriately; for example, see Sections + 4.1.3.3, 4.2.3.9, and 4.2.4 below. A transport protocol + that has its own mechanism for notifying the sender that a + port is unreachable (e.g., TCP, which sends RST segments) + MUST nevertheless accept an ICMP Port Unreachable for the + same purpose. + + A Destination Unreachable message that is received with code + 0 (Net), 1 (Host), or 5 (Bad Source Route) may result from a + routing transient and MUST therefore be interpreted as only + a hint, not proof, that the specified destination is + unreachable [IP:11]. For example, it MUST NOT be used as + proof of a dead gateway (see Section 3.3.1). + + 3.2.2.2 Redirect: RFC-792 + + A host SHOULD NOT send an ICMP Redirect message; Redirects + are to be sent only by gateways. + + A host receiving a Redirect message MUST update its routing + information accordingly. 
Every host MUST be prepared to + + + +Internet Engineering Task Force [Page 40] + + + + +RFC1122 INTERNET LAYER October 1989 + + + accept both Host and Network Redirects and to process them + as described in Section 3.3.1.2 below. + + A Redirect message SHOULD be silently discarded if the new + gateway address it specifies is not on the same connected + (sub-) net through which the Redirect arrived [INTRO:2, + Appendix A], or if the source of the Redirect is not the + current first-hop gateway for the specified destination (see + Section 3.3.1). + + 3.2.2.3 Source Quench: RFC-792 + + A host MAY send a Source Quench message if it is + approaching, or has reached, the point at which it is forced + to discard incoming datagrams due to a shortage of + reassembly buffers or other resources. See Section 2.2.3 of + [INTRO:2] for suggestions on when to send Source Quench. + + If a Source Quench message is received, the IP layer MUST + report it to the transport layer (or ICMP processing). In + general, the transport or application layer SHOULD implement + a mechanism to respond to Source Quench for any protocol + that can send a sequence of datagrams to the same + destination and which can reasonably be expected to maintain + enough state information to make this feasible. See Section + 4 for the handling of Source Quench by TCP and UDP. + + DISCUSSION: + A Source Quench may be generated by the target host or + by some gateway in the path of a datagram. The host + receiving a Source Quench should throttle itself back + for a period of time, then gradually increase the + transmission rate again. The mechanism to respond to + Source Quench may be in the transport layer (for + connection-oriented protocols like TCP) or in the + application layer (for protocols that are built on top + of UDP). 
+ + A mechanism has been proposed [IP:14] to make the IP + layer respond directly to Source Quench by controlling + the rate at which datagrams are sent, however, this + proposal is currently experimental and not currently + recommended. + + 3.2.2.4 Time Exceeded: RFC-792 + + An incoming Time Exceeded message MUST be passed to the + transport layer. + + + +Internet Engineering Task Force [Page 41] + + + + +RFC1122 INTERNET LAYER October 1989 + + + DISCUSSION: + A gateway will send a Time Exceeded Code 0 (In Transit) + message when it discards a datagram due to an expired + TTL field. This indicates either a gateway routing + loop or too small an initial TTL value. + + A host may receive a Time Exceeded Code 1 (Reassembly + Timeout) message from a destination host that has timed + out and discarded an incomplete datagram; see Section + 3.3.2 below. In the future, receipt of this message + might be part of some "MTU discovery" procedure, to + discover the maximum datagram size that can be sent on + the path without fragmentation. + + 3.2.2.5 Parameter Problem: RFC-792 + + A host SHOULD generate Parameter Problem messages. An + incoming Parameter Problem message MUST be passed to the + transport layer, and it MAY be reported to the user. + + DISCUSSION: + The ICMP Parameter Problem message is sent to the + source host for any problem not specifically covered by + another ICMP message. Receipt of a Parameter Problem + message generally indicates some local or remote + implementation error. + + A new variant on the Parameter Problem message is hereby + defined: + Code 1 = required option is missing. + + DISCUSSION: + This variant is currently in use in the military + community for a missing security option. + + 3.2.2.6 Echo Request/Reply: RFC-792 + + Every host MUST implement an ICMP Echo server function that + receives Echo Requests and sends corresponding Echo Replies. 
+ A host SHOULD also implement an application-layer interface + for sending an Echo Request and receiving an Echo Reply, for + diagnostic purposes. + + An ICMP Echo Request destined to an IP broadcast or IP + multicast address MAY be silently discarded. + + + + + + +Internet Engineering Task Force [Page 42] + + + + +RFC1122 INTERNET LAYER October 1989 + + + DISCUSSION: + This neutral provision results from a passionate debate + between those who feel that ICMP Echo to a broadcast + address provides a valuable diagnostic capability and + those who feel that misuse of this feature can too + easily create packet storms. + + The IP source address in an ICMP Echo Reply MUST be the same + as the specific-destination address (defined in Section + 3.2.1.3) of the corresponding ICMP Echo Request message. + + Data received in an ICMP Echo Request MUST be entirely + included in the resulting Echo Reply. However, if sending + the Echo Reply requires intentional fragmentation that is + not implemented, the datagram MUST be truncated to maximum + transmission size (see Section 3.3.3) and sent. + + Echo Reply messages MUST be passed to the ICMP user + interface, unless the corresponding Echo Request originated + in the IP layer. + + If a Record Route and/or Time Stamp option is received in an + ICMP Echo Request, this option (these options) SHOULD be + updated to include the current host and included in the IP + header of the Echo Reply message, without "truncation". + Thus, the recorded route will be for the entire round trip. + + If a Source Route option is received in an ICMP Echo + Request, the return route MUST be reversed and used as a + Source Route option for the Echo Reply message. + + 3.2.2.7 Information Request/Reply: RFC-792 + + A host SHOULD NOT implement these messages. 
+ + DISCUSSION: + The Information Request/Reply pair was intended to + support self-configuring systems such as diskless + workstations, to allow them to discover their IP + network numbers at boot time. However, the RARP and + BOOTP protocols provide better mechanisms for a host to + discover its own IP address. + + 3.2.2.8 Timestamp and Timestamp Reply: RFC-792 + + A host MAY implement Timestamp and Timestamp Reply. If they + are implemented, the following rules MUST be followed. + + + + +Internet Engineering Task Force [Page 43] + + + + +RFC1122 INTERNET LAYER October 1989 + + + o The ICMP Timestamp server function returns a Timestamp + Reply to every Timestamp message that is received. If + this function is implemented, it SHOULD be designed for + minimum variability in delay (e.g., implemented in the + kernel to avoid delay in scheduling a user process). + + The following cases for Timestamp are to be handled + according to the corresponding rules for ICMP Echo: + + o An ICMP Timestamp Request message to an IP broadcast or + IP multicast address MAY be silently discarded. + + o The IP source address in an ICMP Timestamp Reply MUST + be the same as the specific-destination address of the + corresponding Timestamp Request message. + + o If a Source-route option is received in an ICMP Timestamp + Request, the return route MUST be reversed and used as + a Source Route option for the Timestamp Reply message. + + o If a Record Route and/or Timestamp option is received + in a Timestamp Request, this (these) option(s) SHOULD + be updated to include the current host and included in + the IP header of the Timestamp Reply message. + + o Incoming Timestamp Reply messages MUST be passed up to + the ICMP user interface. + + The preferred form for a timestamp value (the "standard + value") is in units of milliseconds since midnight Universal + Time. However, it may be difficult to provide this value + with millisecond resolution.
For example, many systems use + clocks that update only at line frequency, 50 or 60 times + per second. Therefore, some latitude is allowed in a + "standard value": + + (a) A "standard value" MUST be updated at least 15 times + per second (i.e., at most the six low-order bits of the + value may be undefined). + + (b) The accuracy of a "standard value" MUST approximate + that of operator-set CPU clocks, i.e., correct within a + few minutes. + + + + + + + + +Internet Engineering Task Force [Page 44] + + + + +RFC1122 INTERNET LAYER October 1989 + + + 3.2.2.9 Address Mask Request/Reply: RFC-950 + + A host MUST support the first, and MAY implement all three, + of the following methods for determining the address mask(s) + corresponding to its IP address(es): + + (1) static configuration information; + + (2) obtaining the address mask(s) dynamically as a side- + effect of the system initialization process (see + [INTRO:1]); and + + (3) sending ICMP Address Mask Request(s) and receiving ICMP + Address Mask Reply(s). + + The choice of method to be used in a particular host MUST be + configurable. + + When method (3), the use of Address Mask messages, is + enabled, then: + + (a) When it initializes, the host MUST broadcast an Address + Mask Request message on the connected network + corresponding to the IP address. It MUST retransmit + this message a small number of times if it does not + receive an immediate Address Mask Reply. + + (b) Until it has received an Address Mask Reply, the host + SHOULD assume a mask appropriate for the address class + of the IP address, i.e., assume that the connected + network is not subnetted. + + (c) The first Address Mask Reply message received MUST be + used to set the address mask corresponding to the + particular local IP address. This is true even if the + first Address Mask Reply message is "unsolicited", in + which case it will have been broadcast and may arrive + after the host has ceased to retransmit Address Mask + Requests. 
Once the mask has been set by an Address + Mask Reply, later Address Mask Reply messages MUST be + (silently) ignored. + + Conversely, if Address Mask messages are disabled, then no + ICMP Address Mask Requests will be sent, and any ICMP + Address Mask Replies received for that local IP address MUST + be (silently) ignored. + + A host SHOULD make some reasonableness check on any address + + + +Internet Engineering Task Force [Page 45] + + + + +RFC1122 INTERNET LAYER October 1989 + + + mask it installs; see IMPLEMENTATION section below. + + A system MUST NOT send an Address Mask Reply unless it is an + authoritative agent for address masks. An authoritative + agent may be a host or a gateway, but it MUST be explicitly + configured as an address mask agent. Receiving an address + mask via an Address Mask Reply does not give the receiver + authority and MUST NOT be used as the basis for issuing + Address Mask Replies. + + With a statically configured address mask, there SHOULD be + an additional configuration flag that determines whether the + host is to act as an authoritative agent for this mask, + i.e., whether it will answer Address Mask Request messages + using this mask. + + If it is configured as an agent, the host MUST broadcast an + Address Mask Reply for the mask on the appropriate interface + when it initializes. + + See "System Initialization" in [INTRO:1] for more + information about the use of Address Mask Request/Reply + messages. + + DISCUSSION: + Hosts that casually send Address Mask Replies with + invalid address masks have often been a serious + nuisance. To prevent this, Address Mask Replies ought + to be sent only by authoritative agents that have been + selected by explicit administrative action. + + When an authoritative agent receives an Address Mask + Request message, it will send a unicast Address Mask + Reply to the source IP address. If the network part of + this address is zero (see (a) and (b) in 3.2.1.3), the + Reply will be broadcast.
+ + Getting no reply to its Address Mask Request messages, + a host will assume there is no agent and use an + unsubnetted mask, but the agent may be only temporarily + unreachable. An agent will broadcast an unsolicited + Address Mask Reply whenever it initializes, in order to + update the masks of all hosts that have initialized in + the meantime. + + IMPLEMENTATION: + The following reasonableness check on an address mask + is suggested: the mask is not all 1 bits, and it is + + + +Internet Engineering Task Force [Page 46] + + + + +RFC1122 INTERNET LAYER October 1989 + + + either zero or else the 8 highest-order bits are on. + + 3.2.3 Internet Group Management Protocol IGMP + + IGMP [IP:4] is a protocol used between hosts and gateways on a + single network to establish hosts' membership in particular + multicast groups. The gateways use this information, in + conjunction with a multicast routing protocol, to support IP + multicasting across the Internet. + + At this time, implementation of IGMP is OPTIONAL; see Section + 3.3.7 for more information. Without IGMP, a host can still + participate in multicasting local to its connected networks. + + 3.3 SPECIFIC ISSUES + + 3.3.1 Routing Outbound Datagrams + + The IP layer chooses the correct next hop for each datagram it + sends. If the destination is on a connected network, the + datagram is sent directly to the destination host; otherwise, + it has to be routed to a gateway on a connected network. + + 3.3.1.1 Local/Remote Decision + + To decide if the destination is on a connected network, the + following algorithm MUST be used [see IP:3]: + + (a) The address mask (particular to a local IP address for + a multihomed host) is a 32-bit mask that selects the + network number and subnet number fields of the + corresponding IP address. 
+ + (b) If the IP destination address bits extracted by the + address mask match the IP source address bits extracted + by the same mask, then the destination is on the + corresponding connected network, and the datagram is to + be transmitted directly to the destination host. + + (c) If not, then the destination is accessible only through + a gateway. Selection of a gateway is described below + (3.3.1.2). + + A special-case destination address is handled as follows: + + * For a limited broadcast or a multicast address, simply + pass the datagram to the link layer for the appropriate + interface. + + + +Internet Engineering Task Force [Page 47] + + + + +RFC1122 INTERNET LAYER October 1989 + + + * For a (network or subnet) directed broadcast, the + datagram can use the standard routing algorithms. + + The host IP layer MUST operate correctly in a minimal + network environment, and in particular, when there are no + gateways. For example, if the IP layer of a host insists on + finding at least one gateway to initialize, the host will be + unable to operate on a single isolated broadcast net. + + 3.3.1.2 Gateway Selection + + To efficiently route a series of datagrams to the same + destination, the source host MUST keep a "route cache" of + mappings to next-hop gateways. A host uses the following + basic algorithm on this cache to route a datagram; this + algorithm is designed to put the primary routing burden on + the gateways [IP:11]. + + (a) If the route cache contains no information for a + particular destination, the host chooses a "default" + gateway and sends the datagram to it. It also builds a + corresponding Route Cache entry. + + (b) If that gateway is not the best next hop to the + destination, the gateway will forward the datagram to + the best next-hop gateway and return an ICMP Redirect + message to the source host. 
+ + (c) When it receives a Redirect, the host updates the + next-hop gateway in the appropriate route cache entry, + so later datagrams to the same destination will go + directly to the best gateway. + + Since the subnet mask appropriate to the destination address + is generally not known, a Network Redirect message SHOULD be + treated identically to a Host Redirect message; i.e., the + cache entry for the destination host (only) would be updated + (or created, if an entry for that host did not exist) for + the new gateway. + + DISCUSSION: + This recommendation is to protect against gateways that + erroneously send Network Redirects for a subnetted + network, in violation of the gateway requirements + [INTRO:2]. + + When there is no route cache entry for the destination host + address (and the destination is not on the connected + + + +Internet Engineering Task Force [Page 48] + + + + +RFC1122 INTERNET LAYER October 1989 + + + network), the IP layer MUST pick a gateway from its list of + "default" gateways. The IP layer MUST support multiple + default gateways. + + As an extra feature, a host IP layer MAY implement a table + of "static routes". Each such static route MAY include a + flag specifying whether it may be overridden by ICMP + Redirects. + + DISCUSSION: + A host generally needs to know at least one default + gateway to get started. This information can be + obtained from a configuration file or else from the + host startup sequence, e.g., the BOOTP protocol (see + [INTRO:1]). + + It has been suggested that a host can augment its list + of default gateways by recording any new gateways it + learns about. For example, it can record every gateway + to which it is ever redirected. Such a feature, while + possibly useful in some circumstances, may cause + problems in other cases (e.g., gateways are not all + equal), and it is not recommended. 
+ + A static route is typically a particular preset mapping + from destination host or network into a particular + next-hop gateway; it might also depend on the Type-of- + Service (see next section). Static routes would be set + up by system administrators to override the normal + automatic routing mechanism, to handle exceptional + situations. However, any static routing information is + a potential source of failure as configurations change + or equipment fails. + + 3.3.1.3 Route Cache + + Each route cache entry needs to include the following + fields: + + (1) Local IP address (for a multihomed host) + + (2) Destination IP address + + (3) Type(s)-of-Service + + (4) Next-hop gateway IP address + + Field (2) MAY be the full IP address of the destination + + + +Internet Engineering Task Force [Page 49] + + + + +RFC1122 INTERNET LAYER October 1989 + + + host, or only the destination network number. Field (3), + the TOS, SHOULD be included. + + See Section 3.3.4.2 for a discussion of the implications of + multihoming for the lookup procedure in this cache. + + DISCUSSION: + Including the Type-of-Service field in the route cache + and considering it in the host route algorithm will + provide the necessary mechanism for the future when + Type-of-Service routing is commonly used in the + Internet. See Section 3.2.1.6. + + Each route cache entry defines the endpoints of an + Internet path. Although the connecting path may change + dynamically in an arbitrary way, the transmission + characteristics of the path tend to remain + approximately constant over a time period longer than a + single typical host-host transport connection. + Therefore, a route cache entry is a natural place to + cache data on the properties of the path. Examples of + such properties might be the maximum unfragmented + datagram size (see Section 3.3.3), or the average + round-trip delay measured by a transport protocol. 
+ This data will generally be both gathered and used by a + higher layer protocol, e.g., by TCP, or by an + application using UDP. Experiments are currently in + progress on caching path properties in this manner. + + There is no consensus on whether the route cache should + be keyed on destination host addresses alone, or allow + both host and network addresses. Those who favor the + use of only host addresses argue that: + + (1) As required in Section 3.3.1.2, Redirect messages + will generally result in entries keyed on + destination host addresses; the simplest and most + general scheme would be to use host addresses + always. + + (2) The IP layer may not always know the address mask + for a network address in a complex subnetted + environment. + + (3) The use of only host addresses allows the + destination address to be used as a pure 32-bit + number, which may allow the Internet architecture + to be more easily extended in the future without + + + +Internet Engineering Task Force [Page 50] + + + + +RFC1122 INTERNET LAYER October 1989 + + + any change to the hosts. + + The opposing view is that allowing a mixture of + destination hosts and networks in the route cache: + + (1) Saves memory space. + + (2) Leads to a simpler data structure, easily + combining the cache with the tables of default and + static routes (see below). + + (3) Provides a more useful place to cache path + properties, as discussed earlier. + + + IMPLEMENTATION: + The cache needs to be large enough to include entries + for the maximum number of destination hosts that may be + in use at one time. + + A route cache entry may also include control + information used to choose an entry for replacement. + This might take the form of a "recently used" bit, a + use count, or a last-used timestamp, for example. It + is recommended that it include the time of last + modification of the entry, for diagnostic purposes. 
+ + An implementation may wish to reduce the overhead of + scanning the route cache for every datagram to be + transmitted. This may be accomplished with a hash + table to speed the lookup, or by giving a connection- + oriented transport protocol a "hint" or temporary + handle on the appropriate cache entry, to be passed to + the IP layer with each subsequent datagram. + + Although we have described the route cache, the lists + of default gateways, and a table of static routes as + conceptually distinct, in practice they may be combined + into a single "routing table" data structure. + + 3.3.1.4 Dead Gateway Detection + + The IP layer MUST be able to detect the failure of a "next- + hop" gateway that is listed in its route cache and to choose + an alternate gateway (see Section 3.3.1.5). + + Dead gateway detection is covered in some detail in RFC-816 + [IP:11]. Experience to date has not produced a complete + + + +Internet Engineering Task Force [Page 51] + + + + +RFC1122 INTERNET LAYER October 1989 + + + algorithm which is totally satisfactory, though it has + identified several forbidden paths and promising techniques. + + * A particular gateway SHOULD NOT be used indefinitely in + the absence of positive indications that it is + functioning. + + * Active probes such as "pinging" (i.e., using an ICMP + Echo Request/Reply exchange) are expensive and scale + poorly. In particular, hosts MUST NOT actively check + the status of a first-hop gateway by simply pinging the + gateway continuously. + + * Even when it is the only effective way to verify a + gateway's status, pinging MUST be used only when + traffic is being sent to the gateway and when there is + no other positive indication to suggest that the + gateway is functioning. + + * To avoid pinging, the layers above and/or below the + Internet layer SHOULD be able to give "advice" on the + status of route cache entries when either positive + (gateway OK) or negative (gateway dead) information is + available. 
+ + + DISCUSSION: + If an implementation does not include an adequate + mechanism for detecting a dead gateway and re-routing, + a gateway failure may cause datagrams to apparently + vanish into a "black hole". This failure can be + extremely confusing for users and difficult for network + personnel to debug. + + The dead-gateway detection mechanism must not cause + unacceptable load on the host, on connected networks, + or on first-hop gateway(s). The exact constraints on + the timeliness of dead gateway detection and on + acceptable load may vary somewhat depending on the + nature of the host's mission, but a host generally + needs to detect a failed first-hop gateway quickly + enough that transport-layer connections will not break + before an alternate gateway can be selected. + + Passing advice from other layers of the protocol stack + complicates the interfaces between the layers, but it + is the preferred approach to dead gateway detection. + Advice can come from almost any part of the IP/TCP + + + +Internet Engineering Task Force [Page 52] + + + + +RFC1122 INTERNET LAYER October 1989 + + + architecture, but it is expected to come primarily from + the transport and link layers. Here are some possible + sources for gateway advice: + + o TCP or any connection-oriented transport protocol + should be able to give negative advice, e.g., + triggered by excessive retransmissions. + + o TCP may give positive advice when (new) data is + acknowledged. Even though the route may be + asymmetric, an ACK for new data proves that the + acknowledged data must have been transmitted + successfully. + + o An ICMP Redirect message from a particular gateway + should be used as positive advice about that + gateway. + + o Link-layer information that reliably detects and + reports host failures (e.g., ARPANET Destination + Dead messages) should be used as negative advice.
+ + o Failure to ARP or to re-validate ARP mappings may + be used as negative advice for the corresponding + IP address. + + o Packets arriving from a particular link-layer + address are evidence that the system at this + address is alive. However, turning this + information into advice about gateways requires + mapping the link-layer address into an IP address, + and then checking that IP address against the + gateways pointed to by the route cache. This is + probably prohibitively inefficient. + + Note that positive advice that is given for every + datagram received may cause unacceptable overhead in + the implementation. + + While advice might be passed using required arguments + in all interfaces to the IP layer, some transport and + application layer protocols cannot deduce the correct + advice. These interfaces must therefore allow a + neutral value for advice, since either always-positive + or always-negative advice leads to incorrect behavior. + + There is another technique for dead gateway detection + that has been commonly used but is not recommended. + + + +Internet Engineering Task Force [Page 53] + + + + +RFC1122 INTERNET LAYER October 1989 + + + This technique depends upon the host passively + receiving ("wiretapping") the Interior Gateway Protocol + (IGP) datagrams that the gateways are broadcasting to + each other. This approach has the drawback that a host + needs to recognize all the interior gateway protocols + that gateways may use (see [INTRO:2]). In addition, it + only works on a broadcast network. + + At present, pinging (i.e., using ICMP Echo messages) is + the mechanism for gateway probing when absolutely + required. A successful ping guarantees that the + addressed interface and its associated machine are up, + but it does not guarantee that the machine is a gateway + as opposed to a host. 
The normal inference is that if + a Redirect or other evidence indicates that a machine + was a gateway, successful pings will indicate that the + machine is still up and hence still a gateway. + However, since a host silently discards packets that a + gateway would forward or redirect, this assumption + could sometimes fail. To avoid this problem, a new + ICMP message under development will ask "are you a + gateway?" + + IMPLEMENTATION: + The following specific algorithm has been suggested: + + o Associate a "reroute timer" with each gateway + pointed to by the route cache. Initialize the + timer to a value Tr, which must be small enough to + allow detection of a dead gateway before transport + connections time out. + + o Positive advice would reset the reroute timer to + Tr. Negative advice would reduce or zero the + reroute timer. + + o Whenever the IP layer used a particular gateway to + route a datagram, it would check the corresponding + reroute timer. If the timer had expired (reached + zero), the IP layer would send a ping to the + gateway, followed immediately by the datagram. + + o The ping (ICMP Echo) would be sent again if + necessary, up to N times. If no ping reply was + received in N tries, the gateway would be assumed + to have failed, and a new first-hop gateway would + be chosen for all cache entries pointing to the + failed gateway. + + + +Internet Engineering Task Force [Page 54] + + + + +RFC1122 INTERNET LAYER October 1989 + + + Note that the size of Tr is inversely related to the + amount of advice available. 
Tr should be large enough + to insure that: + + * Any pinging will be at a low level (e.g., <10%) of + all packets sent to a gateway from the host, AND + + * pinging is infrequent (e.g., every 3 minutes) + + Since the recommended algorithm is concerned with the + gateways pointed to by route cache entries, rather than + the cache entries themselves, a two level data + structure (perhaps coordinated with ARP or similar + caches) may be desirable for implementing a route + cache. + + 3.3.1.5 New Gateway Selection + + If the failed gateway is not the current default, the IP + layer can immediately switch to a default gateway. If it is + the current default that failed, the IP layer MUST select a + different default gateway (assuming more than one default is + known) for the failed route and for establishing new routes. + + DISCUSSION: + When a gateway does fail, the other gateways on the + connected network will learn of the failure through + some inter-gateway routing protocol. However, this + will not happen instantaneously, since gateway routing + protocols typically have a settling time of 30-60 + seconds. If the host switches to an alternative + gateway before the gateways have agreed on the failure, + the new target gateway will probably forward the + datagram to the failed gateway and send a Redirect back + to the host pointing to the failed gateway (!). The + result is likely to be a rapid oscillation in the + contents of the host's route cache during the gateway + settling period. It has been proposed that the dead- + gateway logic should include some hysteresis mechanism + to prevent such oscillations. However, experience has + not shown any harm from such oscillations, since + service cannot be restored to the host until the + gateways' routing information does settle down. + + IMPLEMENTATION: + One implementation technique for choosing a new default + gateway is to simply round-robin among the default + gateways in the host's list. 
Another is to rank the + + + +Internet Engineering Task Force [Page 55] + + + + +RFC1122 INTERNET LAYER October 1989 + + + gateways in priority order, and when the current + default gateway is not the highest priority one, to + "ping" the higher-priority gateways slowly to detect + when they return to service. This pinging can be at a + very low rate, e.g., 0.005 per second. + + 3.3.1.6 Initialization + + The following information MUST be configurable: + + (1) IP address(es). + + (2) Address mask(s). + + (3) A list of default gateways, with a preference level. + + A manual method of entering this configuration data MUST be + provided. In addition, a variety of methods can be used to + determine this information dynamically; see the section on + "Host Initialization" in [INTRO:1]. + + DISCUSSION: + Some host implementations use "wiretapping" of gateway + protocols on a broadcast network to learn what gateways + exist. A standard method for default gateway discovery + is under development. + + 3.3.2 Reassembly + + The IP layer MUST implement reassembly of IP datagrams. + + We designate the largest datagram size that can be reassembled + by EMTU_R ("Effective MTU to receive"); this is sometimes + called the "reassembly buffer size". EMTU_R MUST be greater + than or equal to 576, SHOULD be either configurable or + indefinite, and SHOULD be greater than or equal to the MTU of + the connected network(s). + + DISCUSSION: + A fixed EMTU_R limit should not be built into the code + because some application layer protocols require EMTU_R + values larger than 576. + + IMPLEMENTATION: + An implementation may use a contiguous reassembly buffer + for each datagram, or it may use a more complex data + structure that places no definite limit on the reassembled + datagram size; in the latter case, EMTU_R is said to be + + + +Internet Engineering Task Force [Page 56] + + + + +RFC1122 INTERNET LAYER October 1989 + + + "indefinite". 
+ + Logically, reassembly is performed by simply copying each + fragment into the packet buffer at the proper offset. + Note that fragments may overlap if successive + retransmissions use different packetizing but the same + reassembly Id. + + The tricky part of reassembly is the bookkeeping to + determine when all bytes of the datagram have been + reassembled. We recommend Clark's algorithm [IP:10] that + requires no additional data space for the bookkeeping. + However, note that, contrary to [IP:10], the first + fragment header needs to be saved for inclusion in a + possible ICMP Time Exceeded (Reassembly Timeout) message. + + There MUST be a mechanism by which the transport layer can + learn MMS_R, the maximum message size that can be received and + reassembled in an IP datagram (see GET_MAXSIZES calls in + Section 3.4). If EMTU_R is not indefinite, then the value of + MMS_R is given by: + + MMS_R = EMTU_R - 20 + + since 20 is the minimum size of an IP header. + + There MUST be a reassembly timeout. The reassembly timeout + value SHOULD be a fixed value, not set from the remaining TTL. + It is recommended that the value lie between 60 seconds and 120 + seconds. If this timeout expires, the partially-reassembled + datagram MUST be discarded and an ICMP Time Exceeded message + sent to the source host (if fragment zero has been received). + + DISCUSSION: + The IP specification says that the reassembly timeout + should be the remaining TTL from the IP header, but this + does not work well because gateways generally treat TTL as + a simple hop count rather than an elapsed time. If the + reassembly timeout is too small, datagrams will be + discarded unnecessarily, and communication may fail. The + timeout needs to be at least as large as the typical + maximum delay across the Internet. A realistic minimum + reassembly timeout would be 60 seconds. 
+ + It has been suggested that a cache might be kept of + round-trip times measured by transport protocols for + various destinations, and that these values might be used + to dynamically determine a reasonable reassembly timeout + + + +Internet Engineering Task Force [Page 57] + + + + +RFC1122 INTERNET LAYER October 1989 + + + value. Further investigation of this approach is + required. + + If the reassembly timeout is set too high, buffer + resources in the receiving host will be tied up too long, + and the MSL (Maximum Segment Lifetime) [TCP:1] will be + larger than necessary. The MSL controls the maximum rate + at which fragmented datagrams can be sent using distinct + values of the 16-bit Ident field; a larger MSL lowers the + maximum rate. The TCP specification [TCP:1] arbitrarily + assumes a value of 2 minutes for MSL. This sets an upper + limit on a reasonable reassembly timeout value. + + 3.3.3 Fragmentation + + Optionally, the IP layer MAY implement a mechanism to fragment + outgoing datagrams intentionally. + + We designate by EMTU_S ("Effective MTU for sending") the + maximum IP datagram size that may be sent, for a particular + combination of IP source and destination addresses and perhaps + TOS. + + A host MUST implement a mechanism to allow the transport layer + to learn MMS_S, the maximum transport-layer message size that + may be sent for a given {source, destination, TOS} triplet (see + GET_MAXSIZES call in Section 3.4). If no local fragmentation + is performed, the value of MMS_S will be: + + MMS_S = EMTU_S - <IP header size> + + and EMTU_S must be less than or equal to the MTU of the network + interface corresponding to the source address of the datagram. + Note that <IP header size> in this equation will be 20, unless + the IP reserves space to insert IP options for its own purposes + in addition to any options inserted by the transport layer.
+ + A host that does not implement local fragmentation MUST ensure + that the transport layer (for TCP) or the application layer + (for UDP) obtains MMS_S from the IP layer and does not send a + datagram exceeding MMS_S in size. + + It is generally desirable to avoid local fragmentation and to + choose EMTU_S low enough to avoid fragmentation in any gateway + along the path. In the absence of actual knowledge of the + minimum MTU along the path, the IP layer SHOULD use + EMTU_S <= 576 whenever the destination address is not on a + connected network, and otherwise use the connected network's + + + +Internet Engineering Task Force [Page 58] + + + + +RFC1122 INTERNET LAYER October 1989 + + + MTU. + + The MTU of each physical interface MUST be configurable. + + A host IP layer implementation MAY have a configuration flag + "All-Subnets-MTU", indicating that the MTU of the connected + network is to be used for destinations on different subnets + within the same network, but not for other networks. Thus, + this flag causes the network class mask, rather than the subnet + address mask, to be used to choose an EMTU_S. For a multihomed + host, an "All-Subnets-MTU" flag is needed for each network + interface. + + DISCUSSION: + Picking the correct datagram size to use when sending data + is a complex topic [IP:9]. + + (a) In general, no host is required to accept an IP + datagram larger than 576 bytes (including header and + data), so a host must not send a larger datagram + without explicit knowledge or prior arrangement with + the destination host. Thus, MMS_S is only an upper + bound on the datagram size that a transport protocol + may send; even when MMS_S exceeds 556, the transport + layer must limit its messages to 556 bytes in the + absence of other knowledge about the destination + host. + + (b) Some transport protocols (e.g., TCP) provide a way to + explicitly inform the sender about the largest + datagram the other end can receive and reassemble + [IP:7]. 
There is no corresponding mechanism in the + IP layer. + + A transport protocol that assumes an EMTU_R larger + than 576 (see Section 3.3.2), can send a datagram of + this larger size to another host that implements the + same protocol. + + (c) Hosts should ideally limit their EMTU_S for a given + destination to the minimum MTU of all the networks + along the path, to avoid any fragmentation. IP + fragmentation, while formally correct, can create a + serious transport protocol performance problem, + because loss of a single fragment means all the + fragments in the segment must be retransmitted + [IP:9]. + + + + +Internet Engineering Task Force [Page 59] + + + + +RFC1122 INTERNET LAYER October 1989 + + + Since nearly all networks in the Internet currently + support an MTU of 576 or greater, we strongly recommend + the use of 576 for datagrams sent to non-local networks. + + It has been suggested that a host could determine the MTU + over a given path by sending a zero-offset datagram + fragment and waiting for the receiver to time out the + reassembly (which cannot complete!) and return an ICMP + Time Exceeded message. This message would include the + largest remaining fragment header in its body. More + direct mechanisms are being experimented with, but have + not yet been adopted (see e.g., RFC-1063). + + 3.3.4 Local Multihoming + + 3.3.4.1 Introduction + + A multihomed host has multiple IP addresses, which we may + think of as "logical interfaces". These logical interfaces + may be associated with one or more physical interfaces, and + these physical interfaces may be connected to the same or + different networks. + + Here are some important cases of multihoming: + + (a) Multiple Logical Networks + + The Internet architects envisioned that each physical + network would have a single unique IP network (or + subnet) number. 
However, LAN administrators have + sometimes found it useful to violate this assumption, + operating a LAN with multiple logical networks per + physical connected network. + + If a host connected to such a physical network is + configured to handle traffic for each of N different + logical networks, then the host will have N logical + interfaces. These could share a single physical + interface, or might use N physical interfaces to the + same network. + + (b) Multiple Logical Hosts + + When a host has multiple IP addresses that all have the + same <Network-number> part (and the same <Subnet-number> part, if any), the logical interfaces are known + as "logical hosts". These logical interfaces might + share a single physical interface or might use separate + + + +Internet Engineering Task Force [Page 60] + + + + +RFC1122 INTERNET LAYER October 1989 + + + physical interfaces to the same physical network. + + (c) Simple Multihoming + + In this case, each logical interface is mapped into a + separate physical interface and each physical interface + is connected to a different physical network. The term + "multihoming" was originally applied only to this case, + but it is now applied more generally. + + A host with embedded gateway functionality will + typically fall into the simple multihoming case. Note, + however, that a host may be simply multihomed without + containing an embedded gateway, i.e., without + forwarding datagrams from one connected network to + another. + + This case presents the most difficult routing problems. + The choice of interface (i.e., the choice of first-hop + network) may significantly affect performance or even + reachability of remote parts of the Internet. + + + Finally, we note another possibility that is NOT + multihoming: one logical interface may be bound to multiple + physical interfaces, in order to increase the reliability or + throughput between directly connected machines by providing + alternative physical paths between them.
For instance, two + systems might be connected by multiple point-to-point links. + We call this "link-layer multiplexing". With link-layer + multiplexing, the protocols above the link layer are unaware + that multiple physical interfaces are present; the link- + layer device driver is responsible for multiplexing and + routing packets across the physical interfaces. + + In the Internet protocol architecture, a transport protocol + instance ("entity") has no address of its own, but instead + uses a single Internet Protocol (IP) address. This has + implications for the IP, transport, and application layers, + and for the interfaces between them. In particular, the + application software may have to be aware of the multiple IP + addresses of a multihomed host; in other cases, the choice + can be made within the network software. + + 3.3.4.2 Multihoming Requirements + + The following general rules apply to the selection of an IP + source address for sending a datagram from a multihomed + + + +Internet Engineering Task Force [Page 61] + + + + +RFC1122 INTERNET LAYER October 1989 + + + host. + + (1) If the datagram is sent in response to a received + datagram, the source address for the response SHOULD be + the specific-destination address of the request. See + Sections 4.1.3.5 and 4.2.3.7 and the "General Issues" + section of [INTRO:1] for more specific requirements on + higher layers. + + Otherwise, a source address must be selected. + + (2) An application MUST be able to explicitly specify the + source address for initiating a connection or a + request. + + (3) In the absence of such a specification, the networking + software MUST choose a source address. Rules for this + choice are described below. + + + There are two key requirement issues related to multihoming: + + (A) A host MAY silently discard an incoming datagram whose + destination address does not correspond to the physical + interface through which it is received. 
+ + (B) A host MAY restrict itself to sending (non-source- + routed) IP datagrams only through the physical + interface that corresponds to the IP source address of + the datagrams. + + + DISCUSSION: + Internet host implementors have used two different + conceptual models for multihoming, briefly summarized + in the following discussion. This document takes no + stand on which model is preferred; each seems to have a + place. This ambivalence is reflected in the issues (A) + and (B) being optional. + + o Strong ES Model + + The Strong ES (End System, i.e., host) model + emphasizes the host/gateway (ES/IS) distinction, + and would therefore substitute MUST for MAY in + issues (A) and (B) above. It tends to model a + multihomed host as a set of logical hosts within + the same physical host. + + + +Internet Engineering Task Force [Page 62] + + + + +RFC1122 INTERNET LAYER October 1989 + + + With respect to (A), proponents of the Strong ES + model note that automatic Internet routing + mechanisms could not route a datagram to a + physical interface that did not correspond to the + destination address. + + Under the Strong ES model, the route computation + for an outgoing datagram is the mapping: + + route(src IP addr, dest IP addr, TOS) + -> gateway + + Here the source address is included as a parameter + in order to select a gateway that is directly + reachable on the corresponding physical interface. + Note that this model logically requires that in + general there be at least one default gateway, and + preferably multiple defaults, for each IP source + address. + + o Weak ES Model + + This view de-emphasizes the ES/IS distinction, and + would therefore substitute MUST NOT for MAY in + issues (A) and (B). This model may be the more + natural one for hosts that wiretap gateway routing + protocols, and is necessary for hosts that have + embedded gateway functionality. + + The Weak ES Model may cause the Redirect mechanism + to fail. 
If a datagram is sent out a physical + interface that does not correspond to the + destination address, the first-hop gateway will + not realize when it needs to send a Redirect. On + the other hand, if the host has embedded gateway + functionality, then it has routing information + without listening to Redirects. + + In the Weak ES model, the route computation for an + outgoing datagram is the mapping: + + route(dest IP addr, TOS) -> gateway, interface + + + + + + + + + +Internet Engineering Task Force [Page 63] + + + + +RFC1122 INTERNET LAYER October 1989 + + + 3.3.4.3 Choosing a Source Address + + DISCUSSION: + When it sends an initial connection request (e.g., a + TCP "SYN" segment) or a datagram service request (e.g., + a UDP-based query), the transport layer on a multihomed + host needs to know which source address to use. If the + application does not specify it, the transport layer + must ask the IP layer to perform the conceptual + mapping: + + GET_SRCADDR(remote IP addr, TOS) + -> local IP address + + Here TOS is the Type-of-Service value (see Section + 3.2.1.6), and the result is the desired source address. + The following rules are suggested for implementing this + mapping: + + (a) If the remote Internet address lies on one of the + (sub-) nets to which the host is directly + connected, a corresponding source address may be + chosen, unless the corresponding interface is + known to be down. + + (b) The route cache may be consulted, to see if there + is an active route to the specified destination + network through any network interface; if so, a + local IP address corresponding to that interface + may be chosen. + + (c) The table of static routes, if any (see Section + 3.3.1.2) may be similarly consulted. + + (d) The default gateways may be consulted. If these + gateways are assigned to different interfaces, the + interface corresponding to the gateway with the + highest preference may be chosen. 
+ + In the future, there may be a defined way for a + multihomed host to ask the gateways on all connected + networks for advice about the best network to use for a + given destination. + + IMPLEMENTATION: + It will be noted that this process is essentially the + same as datagram routing (see Section 3.3.1), and + therefore hosts may be able to combine the + + + +Internet Engineering Task Force [Page 64] + + + + +RFC1122 INTERNET LAYER October 1989 + + + implementation of the two functions. + + 3.3.5 Source Route Forwarding + + Subject to restrictions given below, a host MAY be able to act + as an intermediate hop in a source route, forwarding a source- + routed datagram to the next specified hop. + + However, in performing this gateway-like function, the host + MUST obey all the relevant rules for a gateway forwarding + source-routed datagrams [INTRO:2]. This includes the following + specific provisions, which override the corresponding host + provisions given earlier in this document: + + (A) TTL (ref. Section 3.2.1.7) + + The TTL field MUST be decremented and the datagram perhaps + discarded as specified for a gateway in [INTRO:2]. + + (B) ICMP Destination Unreachable (ref. Section 3.2.2.1) + + A host MUST be able to generate Destination Unreachable + messages with the following codes: + + 4 (Fragmentation Required but DF Set) when a source- + routed datagram cannot be fragmented to fit into the + target network; + + 5 (Source Route Failed) when a source-routed datagram + cannot be forwarded, e.g., because of a routing + problem or because the next hop of a strict source + route is not on a connected network. + + (C) IP Source Address (ref. Section 3.2.1.3) + + A source-routed datagram being forwarded MAY (and normally + will) have a source address that is not one of the IP + addresses of the forwarding host. + + (D) Record Route Option (ref. 
Section 3.2.1.8d) + + A host that is forwarding a source-routed datagram + containing a Record Route option MUST update that option, + if it has room. + + (E) Timestamp Option (ref. Section 3.2.1.8e) + + A host that is forwarding a source-routed datagram + + + +Internet Engineering Task Force [Page 65] + + + + +RFC1122 INTERNET LAYER October 1989 + + + containing a Timestamp Option MUST add the current + timestamp to that option, according to the rules for this + option. + + To define the rules restricting host forwarding of source- + routed datagrams, we use the term "local source-routing" if the + next hop will be through the same physical interface through + which the datagram arrived; otherwise, it is "non-local + source-routing". + + o A host is permitted to perform local source-routing + without restriction. + + o A host that supports non-local source-routing MUST have a + configurable switch to disable forwarding, and this switch + MUST default to disabled. + + o The host MUST satisfy all gateway requirements for + configurable policy filters [INTRO:2] restricting non- + local forwarding. + + If a host receives a datagram with an incomplete source route + but does not forward it for some reason, the host SHOULD return + an ICMP Destination Unreachable (code 5, Source Route Failed) + message, unless the datagram was itself an ICMP error message. + + 3.3.6 Broadcasts + + Section 3.2.1.3 defined the four standard IP broadcast address + forms: + + Limited Broadcast: {-1, -1} + + Directed Broadcast: {<Network-number>,-1} + + Subnet Directed Broadcast: + {<Network-number>,<Subnet-number>,-1} + + All-Subnets Directed Broadcast: {<Network-number>,-1,-1} + + A host MUST recognize any of these forms in the destination + address of an incoming datagram. + + There is a class of hosts* that use non-standard broadcast + address forms, substituting 0 for -1. All hosts SHOULD +_________________________ +*4.2BSD Unix and its derivatives, but not 4.3BSD.
+ + + + +Internet Engineering Task Force [Page 66] + + + + +RFC1122 INTERNET LAYER October 1989 + + + recognize and accept any of these non-standard broadcast + addresses as the destination address of an incoming datagram. + A host MAY optionally have a configuration option to choose the + 0 or the -1 form of broadcast address, for each physical + interface, but this option SHOULD default to the standard (-1) + form. + + When a host sends a datagram to a link-layer broadcast address, + the IP destination address MUST be a legal IP broadcast or IP + multicast address. + + A host SHOULD silently discard a datagram that is received via + a link-layer broadcast (see Section 2.4) but does not specify + an IP multicast or broadcast destination address. + + Hosts SHOULD use the Limited Broadcast address to broadcast to + a connected network. + + + DISCUSSION: + Using the Limited Broadcast address instead of a Directed + Broadcast address may improve system robustness. Problems + are often caused by machines that do not understand the + plethora of broadcast addresses (see Section 3.2.1.3), or + that may have different ideas about which broadcast + addresses are in use. The prime example of the latter is + machines that do not understand subnetting but are + attached to a subnetted net. Sending a Subnet Broadcast + for the connected network will confuse those machines, + which will see it as a message to some other host. + + There has been discussion on whether a datagram addressed + to the Limited Broadcast address ought to be sent from all + the interfaces of a multihomed host. This specification + takes no stand on the issue. + + 3.3.7 IP Multicasting + + A host SHOULD support local IP multicasting on all connected + networks for which a mapping from Class D IP addresses to + link-layer addresses has been specified (see below). 
Support + for local IP multicasting includes sending multicast datagrams, + joining multicast groups and receiving multicast datagrams, and + leaving multicast groups. This implies support for all of + [IP:4] except the IGMP protocol itself, which is OPTIONAL. + + + + + + +Internet Engineering Task Force [Page 67] + + + + +RFC1122 INTERNET LAYER October 1989 + + + DISCUSSION: + IGMP provides gateways that are capable of multicast + routing with the information required to support IP + multicasting across multiple networks. At this time, + multicast-routing gateways are in the experimental stage + and are not widely available. For hosts that are not + connected to networks with multicast-routing gateways or + that do not need to receive multicast datagrams + originating on other networks, IGMP serves no purpose and + is therefore optional for now. However, the rest of + [IP:4] is currently recommended for the purpose of + providing IP-layer access to local network multicast + addressing, as a preferable alternative to local broadcast + addressing. It is expected that IGMP will become + recommended at some future date, when multicast-routing + gateways have become more widely available. + + If IGMP is not implemented, a host SHOULD still join the "all- + hosts" group (224.0.0.1) when the IP layer is initialized and + remain a member for as long as the IP layer is active. + + DISCUSSION: + Joining the "all-hosts" group will support strictly local + uses of multicasting, e.g., a gateway discovery protocol, + even if IGMP is not implemented. + + The mapping of IP Class D addresses to local addresses is + currently specified for the following types of networks: + + o Ethernet/IEEE 802.3, as defined in [IP:4]. + + o Any network that supports broadcast but not multicast, + addressing: all IP Class D addresses map to the local + broadcast address. + + o Any type of point-to-point link (e.g., SLIP or HDLC + links): no mapping required. 
All IP multicast datagrams + are sent as-is, inside the local framing. + + Mappings for other types of networks will be specified in the + future. + + A host SHOULD provide a way for higher-layer protocols or + applications to determine which of the host's connected + network(s) support IP multicast addressing. + + + + + + +Internet Engineering Task Force [Page 68] + + + + +RFC1122 INTERNET LAYER October 1989 + + + 3.3.8 Error Reporting + + Wherever practical, hosts MUST return ICMP error datagrams on + detection of an error, except in those cases where returning an + ICMP error message is specifically prohibited. + + DISCUSSION: + A common phenomenon in datagram networks is the "black + hole disease": datagrams are sent out, but nothing comes + back. Without any error datagrams, it is difficult for + the user to figure out what the problem is. + + 3.4 INTERNET/TRANSPORT LAYER INTERFACE + + The interface between the IP layer and the transport layer MUST + provide full access to all the mechanisms of the IP layer, + including options, Type-of-Service, and Time-to-Live. The + transport layer MUST either have mechanisms to set these interface + parameters, or provide a path to pass them through from an + application, or both. + + DISCUSSION: + Applications are urged to make use of these mechanisms where + applicable, even when the mechanisms are not currently + effective in the Internet (e.g., TOS). This will allow these + mechanisms to be immediately useful when they do become + effective, without a large amount of retrofitting of host + software. + + We now describe a conceptual interface between the transport layer + and the IP layer, as a set of procedure calls. This is an + extension of the information in Section 3.3 of RFC-791 [IP:1]. + + + * Send Datagram + + SEND(src, dst, prot, TOS, TTL, BufPTR, len, Id, DF, opt + => result ) + + where the parameters are defined in RFC-791. Passing an Id + parameter is optional; see Section 3.2.1.5. 
+ + + * Receive Datagram + + RECV(BufPTR, prot + => result, src, dst, SpecDest, TOS, len, opt) + + + + +Internet Engineering Task Force [Page 69] + + + + +RFC1122 INTERNET LAYER October 1989 + + + All the parameters are defined in RFC-791, except for: + + SpecDest = specific-destination address of datagram + (defined in Section 3.2.1.3) + + The result parameter dst contains the datagram's destination + address. Since this may be a broadcast or multicast address, + the SpecDest parameter (not shown in RFC-791) MUST be passed. + The parameter opt contains all the IP options received in the + datagram; these MUST also be passed to the transport layer. + + + * Select Source Address + + GET_SRCADDR(remote, TOS) -> local + + remote = remote IP address + TOS = Type-of-Service + local = local IP address + + See Section 3.3.4.3. + + + * Find Maximum Datagram Sizes + + GET_MAXSIZES(local, remote, TOS) -> MMS_R, MMS_S + + MMS_R = maximum receive transport-message size. + MMS_S = maximum send transport-message size. + (local, remote, TOS defined above) + + See Sections 3.3.2 and 3.3.3. + + + * Advice on Delivery Success + + ADVISE_DELIVPROB(sense, local, remote, TOS) + + Here the parameter sense is a 1-bit flag indicating whether + positive or negative advice is being given; see the + discussion in Section 3.3.1.4. The other parameters were + defined earlier. + + + * Send ICMP Message + + SEND_ICMP(src, dst, TOS, TTL, BufPTR, len, Id, DF, opt) + -> result + + + +Internet Engineering Task Force [Page 70] + + + + +RFC1122 INTERNET LAYER October 1989 + + + (Parameters defined in RFC-791). + + Passing an Id parameter is optional; see Section 3.2.1.5. + The transport layer MUST be able to send certain ICMP + messages: Port Unreachable or any of the query-type + messages. This function could be considered to be a special + case of the SEND() call, of course; we describe it separately + for clarity. 
+ + + * Receive ICMP Message + + RECV_ICMP(BufPTR ) -> result, src, dst, len, opt + + (Parameters defined in RFC-791). + + The IP layer MUST pass certain ICMP messages up to the + appropriate transport-layer routine. This function could be + considered to be a special case of the RECV() call, of + course; we describe it separately for clarity. + + For an ICMP error message, the data that is passed up MUST + include the original Internet header plus all the octets of + the original message that are included in the ICMP message. + This data will be used by the transport layer to locate the + connection state information, if any. + + In particular, the following ICMP messages are to be passed + up: + + o Destination Unreachable + + o Source Quench + + o Echo Reply (to ICMP user interface, unless the Echo + Request originated in the IP layer) + + o Timestamp Reply (to ICMP user interface) + + o Time Exceeded + + + DISCUSSION: + In the future, there may be additions to this interface to + pass path data (see Section 3.3.1.3) between the IP and + transport layers. 
+ + + + + +Internet Engineering Task Force [Page 71] + + + + +RFC1122 INTERNET LAYER October 1989 + + + 3.5 INTERNET LAYER REQUIREMENTS SUMMARY + + + | | | | |S| | + | | | | |H| |F + | | | | |O|M|o + | | |S| |U|U|o + | | |H| |L|S|t + | |M|O| |D|T|n + | |U|U|M| | |o + | |S|L|A|N|N|t + | |T|D|Y|O|O|t +FEATURE |SECTION | | | |T|T|e +-------------------------------------------------|--------|-|-|-|-|-|-- + | | | | | | | +Implement IP and ICMP |3.1 |x| | | | | +Handle remote multihoming in application layer |3.1 |x| | | | | +Support local multihoming |3.1 | | |x| | | +Meet gateway specs if forward datagrams |3.1 |x| | | | | +Configuration switch for embedded gateway |3.1 |x| | | | |1 + Config switch default to non-gateway |3.1 |x| | | | |1 + Auto-config based on number of interfaces |3.1 | | | | |x|1 +Able to log discarded datagrams |3.1 | |x| | | | + Record in counter |3.1 | |x| | | | + | | | | | | | +Silently discard Version != 4 |3.2.1.1 |x| | | | | +Verify IP checksum, silently discard bad dgram |3.2.1.2 |x| | | | | +Addressing: | | | | | | | + Subnet addressing (RFC-950) |3.2.1.3 |x| | | | | + Src address must be host's own IP address |3.2.1.3 |x| | | | | + Silently discard datagram with bad dest addr |3.2.1.3 |x| | | | | + Silently discard datagram with bad src addr |3.2.1.3 |x| | | | | +Support reassembly |3.2.1.4 |x| | | | | +Retain same Id field in identical datagram |3.2.1.5 | | |x| | | + | | | | | | | +TOS: | | | | | | | + Allow transport layer to set TOS |3.2.1.6 |x| | | | | + Pass received TOS up to transport layer |3.2.1.6 | |x| | | | + Use RFC-795 link-layer mappings for TOS |3.2.1.6 | | | |x| | +TTL: | | | | | | | + Send packet with TTL of 0 |3.2.1.7 | | | | |x| + Discard received packets with TTL < 2 |3.2.1.7 | | | | |x| + Allow transport layer to set TTL |3.2.1.7 |x| | | | | + Fixed TTL is configurable |3.2.1.7 |x| | | | | + | | | | | | | +IP Options: | | | | | | | + Allow transport layer to send IP options |3.2.1.8 |x| | | | | + Pass all IP options 
rcvd to higher layer |3.2.1.8 |x| | | | | + + + +Internet Engineering Task Force [Page 72] + + + + +RFC1122 INTERNET LAYER October 1989 + + + IP layer silently ignore unknown options |3.2.1.8 |x| | | | | + Security option |3.2.1.8a| | |x| | | + Send Stream Identifier option |3.2.1.8b| | | |x| | + Silently ignore Stream Identifier option |3.2.1.8b|x| | | | | + Record Route option |3.2.1.8d| | |x| | | + Timestamp option |3.2.1.8e| | |x| | | +Source Route Option: | | | | | | | + Originate & terminate Source Route options |3.2.1.8c|x| | | | | + Datagram with completed SR passed up to TL |3.2.1.8c|x| | | | | + Build correct (non-redundant) return route |3.2.1.8c|x| | | | | + Send multiple SR options in one header |3.2.1.8c| | | | |x| + | | | | | | | +ICMP: | | | | | | | + Silently discard ICMP msg with unknown type |3.2.2 |x| | | | | + Include more than 8 octets of orig datagram |3.2.2 | | |x| | | + Included octets same as received |3.2.2 |x| | | | | + Demux ICMP Error to transport protocol |3.2.2 |x| | | | | + Send ICMP error message with TOS=0 |3.2.2 | |x| | | | + Send ICMP error message for: | | | | | | | + - ICMP error msg |3.2.2 | | | | |x| + - IP b'cast or IP m'cast |3.2.2 | | | | |x| + - Link-layer b'cast |3.2.2 | | | | |x| + - Non-initial fragment |3.2.2 | | | | |x| + - Datagram with non-unique src address |3.2.2 | | | | |x| + Return ICMP error msgs (when not prohibited) |3.3.8 |x| | | | | + | | | | | | | + Dest Unreachable: | | | | | | | + Generate Dest Unreachable (code 2/3) |3.2.2.1 | |x| | | | + Pass ICMP Dest Unreachable to higher layer |3.2.2.1 |x| | | | | + Higher layer act on Dest Unreach |3.2.2.1 | |x| | | | + Interpret Dest Unreach as only hint |3.2.2.1 |x| | | | | + Redirect: | | | | | | | + Host send Redirect |3.2.2.2 | | | |x| | + Update route cache when recv Redirect |3.2.2.2 |x| | | | | + Handle both Host and Net Redirects |3.2.2.2 |x| | | | | + Discard illegal Redirect |3.2.2.2 | |x| | | | + Source Quench: | | | | | | | + Send Source Quench if 
buffering exceeded |3.2.2.3 | | |x| | | + Pass Source Quench to higher layer |3.2.2.3 |x| | | | | + Higher layer act on Source Quench |3.2.2.3 | |x| | | | + Time Exceeded: pass to higher layer |3.2.2.4 |x| | | | | + Parameter Problem: | | | | | | | + Send Parameter Problem messages |3.2.2.5 | |x| | | | + Pass Parameter Problem to higher layer |3.2.2.5 |x| | | | | + Report Parameter Problem to user |3.2.2.5 | | |x| | | + | | | | | | | + ICMP Echo Request or Reply: | | | | | | | + Echo server and Echo client |3.2.2.6 |x| | | | | + + + +Internet Engineering Task Force [Page 73] + + + + +RFC1122 INTERNET LAYER October 1989 + + + Echo client |3.2.2.6 | |x| | | | + Discard Echo Request to broadcast address |3.2.2.6 | | |x| | | + Discard Echo Request to multicast address |3.2.2.6 | | |x| | | + Use specific-dest addr as Echo Reply src |3.2.2.6 |x| | | | | + Send same data in Echo Reply |3.2.2.6 |x| | | | | + Pass Echo Reply to higher layer |3.2.2.6 |x| | | | | + Reflect Record Route, Time Stamp options |3.2.2.6 | |x| | | | + Reverse and reflect Source Route option |3.2.2.6 |x| | | | | + | | | | | | | + ICMP Information Request or Reply: |3.2.2.7 | | | |x| | + ICMP Timestamp and Timestamp Reply: |3.2.2.8 | | |x| | | + Minimize delay variability |3.2.2.8 | |x| | | |1 + Silently discard b'cast Timestamp |3.2.2.8 | | |x| | |1 + Silently discard m'cast Timestamp |3.2.2.8 | | |x| | |1 + Use specific-dest addr as TS Reply src |3.2.2.8 |x| | | | |1 + Reflect Record Route, Time Stamp options |3.2.2.6 | |x| | | |1 + Reverse and reflect Source Route option |3.2.2.8 |x| | | | |1 + Pass Timestamp Reply to higher layer |3.2.2.8 |x| | | | |1 + Obey rules for "standard value" |3.2.2.8 |x| | | | |1 + | | | | | | | + ICMP Address Mask Request and Reply: | | | | | | | + Addr Mask source configurable |3.2.2.9 |x| | | | | + Support static configuration of addr mask |3.2.2.9 |x| | | | | + Get addr mask dynamically during booting |3.2.2.9 | | |x| | | + Get addr via ICMP Addr Mask Request/Reply 
|3.2.2.9 | | |x| | | + Retransmit Addr Mask Req if no Reply |3.2.2.9 |x| | | | |3 + Assume default mask if no Reply |3.2.2.9 | |x| | | |3 + Update address mask from first Reply only |3.2.2.9 |x| | | | |3 + Reasonableness check on Addr Mask |3.2.2.9 | |x| | | | + Send unauthorized Addr Mask Reply msgs |3.2.2.9 | | | | |x| + Explicitly configured to be agent |3.2.2.9 |x| | | | | + Static config=> Addr-Mask-Authoritative flag |3.2.2.9 | |x| | | | + Broadcast Addr Mask Reply when init. |3.2.2.9 |x| | | | |3 + | | | | | | | +ROUTING OUTBOUND DATAGRAMS: | | | | | | | + Use address mask in local/remote decision |3.3.1.1 |x| | | | | + Operate with no gateways on conn network |3.3.1.1 |x| | | | | + Maintain "route cache" of next-hop gateways |3.3.1.2 |x| | | | | + Treat Host and Net Redirect the same |3.3.1.2 | |x| | | | + If no cache entry, use default gateway |3.3.1.2 |x| | | | | + Support multiple default gateways |3.3.1.2 |x| | | | | + Provide table of static routes |3.3.1.2 | | |x| | | + Flag: route overridable by Redirects |3.3.1.2 | | |x| | | + Key route cache on host, not net address |3.3.1.3 | | |x| | | + Include TOS in route cache |3.3.1.3 | |x| | | | + | | | | | | | + Able to detect failure of next-hop gateway |3.3.1.4 |x| | | | | + Assume route is good forever |3.3.1.4 | | | |x| | + + + +Internet Engineering Task Force [Page 74] + + + + +RFC1122 INTERNET LAYER October 1989 + + + Ping gateways continuously |3.3.1.4 | | | | |x| + Ping only when traffic being sent |3.3.1.4 |x| | | | | + Ping only when no positive indication |3.3.1.4 |x| | | | | + Higher and lower layers give advice |3.3.1.4 | |x| | | | + Switch from failed default g'way to another |3.3.1.5 |x| | | | | + Manual method of entering config info |3.3.1.6 |x| | | | | + | | | | | | | +REASSEMBLY and FRAGMENTATION: | | | | | | | + Able to reassemble incoming datagrams |3.3.2 |x| | | | | + At least 576 byte datagrams |3.3.2 |x| | | | | + EMTU_R configurable or indefinite |3.3.2 | |x| | | | + Transport layer 
able to learn MMS_R |3.3.2 |x| | | | | + Send ICMP Time Exceeded on reassembly timeout |3.3.2 |x| | | | | + Fixed reassembly timeout value |3.3.2 | |x| | | | + | | | | | | | + Pass MMS_S to higher layers |3.3.3 |x| | | | | + Local fragmentation of outgoing packets |3.3.3 | | |x| | | + Else don't send bigger than MMS_S |3.3.3 |x| | | | | + Send max 576 to off-net destination |3.3.3 | |x| | | | + All-Subnets-MTU configuration flag |3.3.3 | | |x| | | + | | | | | | | +MULTIHOMING: | | | | | | | + Reply with same addr as spec-dest addr |3.3.4.2 | |x| | | | + Allow application to choose local IP addr |3.3.4.2 |x| | | | | + Silently discard d'gram in "wrong" interface |3.3.4.2 | | |x| | | + Only send d'gram through "right" interface |3.3.4.2 | | |x| | |4 + | | | | | | | +SOURCE-ROUTE FORWARDING: | | | | | | | + Forward datagram with Source Route option |3.3.5 | | |x| | |1 + Obey corresponding gateway rules |3.3.5 |x| | | | |1 + Update TTL by gateway rules |3.3.5 |x| | | | |1 + Able to generate ICMP err code 4, 5 |3.3.5 |x| | | | |1 + IP src addr not local host |3.3.5 | | |x| | |1 + Update Timestamp, Record Route options |3.3.5 |x| | | | |1 + Configurable switch for non-local SRing |3.3.5 |x| | | | |1 + Defaults to OFF |3.3.5 |x| | | | |1 + Satisfy gwy access rules for non-local SRing |3.3.5 |x| | | | |1 + If not forward, send Dest Unreach (cd 5) |3.3.5 | |x| | | |2 + | | | | | | | +BROADCAST: | | | | | | | + Broadcast addr as IP source addr |3.2.1.3 | | | | |x| + Receive 0 or -1 broadcast formats OK |3.3.6 | |x| | | | + Config'ble option to send 0 or -1 b'cast |3.3.6 | | |x| | | + Default to -1 broadcast |3.3.6 | |x| | | | + Recognize all broadcast address formats |3.3.6 |x| | | | | + Use IP b'cast/m'cast addr in link-layer b'cast |3.3.6 |x| | | | | + Silently discard link-layer-only b'cast dg's |3.3.6 | |x| | | | + Use Limited Broadcast addr for connected net |3.3.6 | |x| | | | + + + +Internet Engineering Task Force [Page 75] + + + + +RFC1122 INTERNET LAYER October 1989 
+ + + | | | | | | | +MULTICAST: | | | | | | | + Support local IP multicasting (RFC-1112) |3.3.7 | |x| | | | + Support IGMP (RFC-1112) |3.3.7 | | |x| | | + Join all-hosts group at startup |3.3.7 | |x| | | | + Higher layers learn i'face m'cast capability |3.3.7 | |x| | | | + | | | | | | | +INTERFACE: | | | | | | | + Allow transport layer to use all IP mechanisms |3.4 |x| | | | | + Pass interface ident up to transport layer |3.4 |x| | | | | + Pass all IP options up to transport layer |3.4 |x| | | | | + Transport layer can send certain ICMP messages |3.4 |x| | | | | + Pass spec'd ICMP messages up to transp. layer |3.4 |x| | | | | + Include IP hdr+8 octets or more from orig. |3.4 |x| | | | | + Able to leap tall buildings at a single bound |3.5 | |x| | | | + +Footnotes: + +(1) Only if feature is implemented. + +(2) This requirement is overruled if datagram is an ICMP error message. + +(3) Only if feature is implemented and is configured "on". + +(4) Unless has embedded gateway functionality or is source routed. + + + + + + + + + + + + + + + + + + + + + + + + + + +Internet Engineering Task Force [Page 76] + + + + +RFC1122 TRANSPORT LAYER -- UDP October 1989 + + +4. TRANSPORT PROTOCOLS + + 4.1 USER DATAGRAM PROTOCOL -- UDP + + 4.1.1 INTRODUCTION + + The User Datagram Protocol UDP [UDP:1] offers only a minimal + transport service -- non-guaranteed datagram delivery -- and + gives applications direct access to the datagram service of the + IP layer. UDP is used by applications that do not require the + level of service of TCP or that wish to use communications + services (e.g., multicast or broadcast delivery) not available + from TCP. + + UDP is almost a null protocol; the only services it provides + over IP are checksumming of data and multiplexing by port + number. 
Therefore, an application program running over UDP + must deal directly with end-to-end communication problems that + a connection-oriented protocol would have handled -- e.g., + retransmission for reliable delivery, packetization and + reassembly, flow control, congestion avoidance, etc., when + these are required. The fairly complex coupling between IP and + TCP will be mirrored in the coupling between UDP and many + applications using UDP. + + 4.1.2 PROTOCOL WALK-THROUGH + + There are no known errors in the specification of UDP. + + 4.1.3 SPECIFIC ISSUES + + 4.1.3.1 Ports + + UDP well-known ports follow the same rules as TCP well-known + ports; see Section 4.2.2.1 below. + + If a datagram arrives addressed to a UDP port for which + there is no pending LISTEN call, UDP SHOULD send an ICMP + Port Unreachable message. + + 4.1.3.2 IP Options + + UDP MUST pass any IP option that it receives from the IP + layer transparently to the application layer. + + An application MUST be able to specify IP options to be sent + in its UDP datagrams, and UDP MUST pass these options to the + IP layer. + + + +Internet Engineering Task Force [Page 77] + + + + +RFC1122 TRANSPORT LAYER -- UDP October 1989 + + + DISCUSSION: + At present, the only options that need be passed + through UDP are Source Route, Record Route, and Time + Stamp. However, new options may be defined in the + future, and UDP need not and should not make any + assumptions about the format or content of options it + passes to or from the application; an exception to this + might be an IP-layer security option. + + An application based on UDP will need to obtain a + source route from a request datagram and supply a + reversed route for sending the corresponding reply. + + 4.1.3.3 ICMP Messages + + UDP MUST pass to the application layer all ICMP error + messages that it receives from the IP layer. Conceptually + at least, this may be accomplished with an upcall to the + ERROR_REPORT routine (see Section 4.2.4.1). 
+ + DISCUSSION: + Note that ICMP error messages resulting from sending a + UDP datagram are received asynchronously. A UDP-based + application that wants to receive ICMP error messages + is responsible for maintaining the state necessary to + demultiplex these messages when they arrive; for + example, the application may keep a pending receive + operation for this purpose. The application is also + responsible to avoid confusion from a delayed ICMP + error message resulting from an earlier use of the same + port(s). + + 4.1.3.4 UDP Checksums + + A host MUST implement the facility to generate and validate + UDP checksums. An application MAY optionally be able to + control whether a UDP checksum will be generated, but it + MUST default to checksumming on. + + If a UDP datagram is received with a checksum that is non- + zero and invalid, UDP MUST silently discard the datagram. + An application MAY optionally be able to control whether UDP + datagrams without checksums should be discarded or passed to + the application. + + DISCUSSION: + Some applications that normally run only across local + area networks have chosen to turn off UDP checksums for + + + +Internet Engineering Task Force [Page 78] + + + + +RFC1122 TRANSPORT LAYER -- UDP October 1989 + + + efficiency. As a result, numerous cases of undetected + errors have been reported. The advisability of ever + turning off UDP checksumming is very controversial. + + IMPLEMENTATION: + There is a common implementation error in UDP + checksums. Unlike the TCP checksum, the UDP checksum + is optional; the value zero is transmitted in the + checksum field of a UDP header to indicate the absence + of a checksum. If the transmitter really calculates a + UDP checksum of zero, it must transmit the checksum as + all 1's (65535). No special action is required at the + receiver, since zero and 65535 are equivalent in 1's + complement arithmetic. 
+ + 4.1.3.5 UDP Multihoming + + When a UDP datagram is received, its specific-destination + address MUST be passed up to the application layer. + + An application program MUST be able to specify the IP source + address to be used for sending a UDP datagram or to leave it + unspecified (in which case the networking software will + choose an appropriate source address). There SHOULD be a + way to communicate the chosen source address up to the + application layer (e.g., so that the application can later + receive a reply datagram only from the corresponding + interface). + + DISCUSSION: + A request/response application that uses UDP should use + a source address for the response that is the same as + the specific destination address of the request. See + the "General Issues" section of [INTRO:1]. + + 4.1.3.6 Invalid Addresses + + A UDP datagram received with an invalid IP source address + (e.g., a broadcast or multicast address) must be discarded + by UDP or by the IP layer (see Section 3.2.1.3). + + When a host sends a UDP datagram, the source address MUST be + (one of) the IP address(es) of the host. + + 4.1.4 UDP/APPLICATION LAYER INTERFACE + + The application interface to UDP MUST provide the full services + of the IP/transport interface described in Section 3.4 of this + + + +Internet Engineering Task Force [Page 79] + + + + +RFC1122 TRANSPORT LAYER -- UDP October 1989 + + + document. Thus, an application using UDP needs the functions + of the GET_SRCADDR(), GET_MAXSIZES(), ADVISE_DELIVPROB(), and + RECV_ICMP() calls described in Section 3.4. For example, + GET_MAXSIZES() can be used to learn the effective maximum UDP + datagram size for a particular {interface,remote + host,TOS} triplet. + + An application-layer program MUST be able to set the TTL and + TOS values as well as IP options for sending a UDP datagram, + and these values must be passed transparently to the IP layer. + UDP MAY pass the received TOS up to the application layer. 
+ + 4.1.5 UDP REQUIREMENTS SUMMARY + + + | | | | |S| | + | | | | |H| |F + | | | | |O|M|o + | | |S| |U|U|o + | | |H| |L|S|t + | |M|O| |D|T|n + | |U|U|M| | |o + | |S|L|A|N|N|t + | |T|D|Y|O|O|t +FEATURE |SECTION | | | |T|T|e +-------------------------------------------------|--------|-|-|-|-|-|-- + | | | | | | | + UDP | | | | | | | +-------------------------------------------------|--------|-|-|-|-|-|-- + | | | | | | | +UDP send Port Unreachable |4.1.3.1 | |x| | | | + | | | | | | | +IP Options in UDP | | | | | | | + - Pass rcv'd IP options to applic layer |4.1.3.2 |x| | | | | + - Applic layer can specify IP options in Send |4.1.3.2 |x| | | | | + - UDP passes IP options down to IP layer |4.1.3.2 |x| | | | | + | | | | | | | +Pass ICMP msgs up to applic layer |4.1.3.3 |x| | | | | + | | | | | | | +UDP checksums: | | | | | | | + - Able to generate/check checksum |4.1.3.4 |x| | | | | + - Silently discard bad checksum |4.1.3.4 |x| | | | | + - Sender Option to not generate checksum |4.1.3.4 | | |x| | | + - Default is to checksum |4.1.3.4 |x| | | | | + - Receiver Option to require checksum |4.1.3.4 | | |x| | | + | | | | | | | +UDP Multihoming | | | | | | | + - Pass spec-dest addr to application |4.1.3.5 |x| | | | | + + + +Internet Engineering Task Force [Page 80] + + + + +RFC1122 TRANSPORT LAYER -- UDP October 1989 + + + - Applic layer can specify Local IP addr |4.1.3.5 |x| | | | | + - Applic layer specify wild Local IP addr |4.1.3.5 |x| | | | | + - Applic layer notified of Local IP addr used |4.1.3.5 | |x| | | | + | | | | | | | +Bad IP src addr silently discarded by UDP/IP |4.1.3.6 |x| | | | | +Only send valid IP source address |4.1.3.6 |x| | | | | +UDP Application Interface Services | | | | | | | +Full IP interface of 3.4 for application |4.1.4 |x| | | | | + - Able to spec TTL, TOS, IP opts when send dg |4.1.4 |x| | | | | + - Pass received TOS up to applic layer |4.1.4 | | |x| | | + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Internet 
Engineering Task Force [Page 81] + + + + +RFC1122 TRANSPORT LAYER -- TCP October 1989 + + + 4.2 TRANSMISSION CONTROL PROTOCOL -- TCP + + 4.2.1 INTRODUCTION + + The Transmission Control Protocol TCP [TCP:1] is the primary + virtual-circuit transport protocol for the Internet suite. TCP + provides reliable, in-sequence delivery of a full-duplex stream + of octets (8-bit bytes). TCP is used by those applications + needing reliable, connection-oriented transport service, e.g., + mail (SMTP), file transfer (FTP), and virtual terminal service + (Telnet); requirements for these application-layer protocols + are described in [INTRO:1]. + + 4.2.2 PROTOCOL WALK-THROUGH + + 4.2.2.1 Well-Known Ports: RFC-793 Section 2.7 + + DISCUSSION: + TCP reserves port numbers in the range 0-255 for + "well-known" ports, used to access services that are + standardized across the Internet. The remainder of the + port space can be freely allocated to application + processes. Current well-known port definitions are + listed in the RFC entitled "Assigned Numbers" + [INTRO:6]. A prerequisite for defining a new well- + known port is an RFC documenting the proposed service + in enough detail to allow new implementations. + + Some systems extend this notion by adding a third + subdivision of the TCP port space: reserved ports, + which are generally used for operating-system-specific + services. For example, reserved ports might fall + between 256 and some system-dependent upper limit. + Some systems further choose to protect well-known and + reserved ports by permitting only privileged users to + open TCP connections with those port values. This is + perfectly reasonable as long as the host does not + assume that all hosts protect their low-numbered ports + in this manner. + + 4.2.2.2 Use of Push: RFC-793 Section 2.8 + + When an application issues a series of SEND calls without + setting the PUSH flag, the TCP MAY aggregate the data + internally without sending it. 
Similarly, when a series of + segments is received without the PSH bit, a TCP MAY queue + the data internally without passing it to the receiving + application. + + + +Internet Engineering Task Force [Page 82] + + + + +RFC1122 TRANSPORT LAYER -- TCP October 1989 + + + The PSH bit is not a record marker and is independent of + segment boundaries. The transmitter SHOULD collapse + successive PSH bits when it packetizes data, to send the + largest possible segment. + + A TCP MAY implement PUSH flags on SEND calls. If PUSH flags + are not implemented, then the sending TCP: (1) must not + buffer data indefinitely, and (2) MUST set the PSH bit in + the last buffered segment (i.e., when there is no more + queued data to be sent). + + The discussion in RFC-793 on pages 48, 50, and 74 + erroneously implies that a received PSH flag must be passed + to the application layer. Passing a received PSH flag to + the application layer is now OPTIONAL. + + An application program is logically required to set the PUSH + flag in a SEND call whenever it needs to force delivery of + the data to avoid a communication deadlock. However, a TCP + SHOULD send a maximum-sized segment whenever possible, to + improve performance (see Section 4.2.3.4). + + DISCUSSION: + When the PUSH flag is not implemented on SEND calls, + i.e., when the application/TCP interface uses a pure + streaming model, responsibility for aggregating any + tiny data fragments to form reasonable sized segments + is partially borne by the application layer. + + Generally, an interactive application protocol must set + the PUSH flag at least in the last SEND call in each + command or response sequence. A bulk transfer protocol + like FTP should set the PUSH flag on the last segment + of a file or when necessary to prevent buffer deadlock. + + At the receiver, the PSH bit forces buffered data to be + delivered to the application (even if less than a full + buffer has been received). 
Conversely, the lack of a + PSH bit can be used to avoid unnecessary wakeup calls + to the application process; this can be an important + performance optimization for large timesharing hosts. + Passing the PSH bit to the receiving application allows + an analogous optimization within the application. + + 4.2.2.3 Window Size: RFC-793 Section 3.1 + + The window size MUST be treated as an unsigned number, or + else large window sizes will appear like negative windows + + + +Internet Engineering Task Force [Page 83] + + + + +RFC1122 TRANSPORT LAYER -- TCP October 1989 + + + and TCP will not work. It is RECOMMENDED that + implementations reserve 32-bit fields for the send and + receive window sizes in the connection record and do all + window computations with 32 bits. + + DISCUSSION: + It is known that the window field in the TCP header is + too small for high-speed, long-delay paths. + Experimental TCP options have been defined to extend + the window size; see for example [TCP:11]. In + anticipation of the adoption of such an extension, TCP + implementors should treat windows as 32 bits. + + 4.2.2.4 Urgent Pointer: RFC-793 Section 3.1 + + The second sentence is in error: the urgent pointer points + to the sequence number of the LAST octet (not LAST+1) in a + sequence of urgent data. The description on page 56 (last + sentence) is correct. + + A TCP MUST support a sequence of urgent data of any length. + + A TCP MUST inform the application layer asynchronously + whenever it receives an Urgent pointer and there was + previously no pending urgent data, or whenever the Urgent + pointer advances in the data stream. There MUST be a way + for the application to learn how much urgent data remains to + be read from the connection, or at least to determine + whether or not more urgent data remains to be read. 
+ + DISCUSSION: + Although the Urgent mechanism may be used for any + application, it is normally used to send "interrupt"- + type commands to a Telnet program (see "Using Telnet + Synch Sequence" section in [INTRO:1]). + + The asynchronous or "out-of-band" notification will + allow the application to go into "urgent mode", reading + data from the TCP connection. This allows control + commands to be sent to an application whose normal + input buffers are full of unprocessed data. + + IMPLEMENTATION: + The generic ERROR-REPORT() upcall described in Section + 4.2.4.1 is a possible mechanism for informing the + application of the arrival of urgent data. + + + + + +Internet Engineering Task Force [Page 84] + + + + +RFC1122 TRANSPORT LAYER -- TCP October 1989 + + + 4.2.2.5 TCP Options: RFC-793 Section 3.1 + + A TCP MUST be able to receive a TCP option in any segment. + A TCP MUST ignore without error any TCP option it does not + implement, assuming that the option has a length field (all + TCP options defined in the future will have length fields). + TCP MUST be prepared to handle an illegal option length + (e.g., zero) without crashing; a suggested procedure is to + reset the connection and log the reason. + + 4.2.2.6 Maximum Segment Size Option: RFC-793 Section 3.1 + + TCP MUST implement both sending and receiving the Maximum + Segment Size option [TCP:4]. + + TCP SHOULD send an MSS (Maximum Segment Size) option in + every SYN segment when its receive MSS differs from the + default 536, and MAY send it always. + + If an MSS option is not received at connection setup, TCP + MUST assume a default send MSS of 536 (576-40) [TCP:4]. 
+ + The maximum size of a segment that TCP really sends, the + "effective send MSS," MUST be the smaller of the send MSS + (which reflects the available reassembly buffer size at the + remote host) and the largest size permitted by the IP layer: + + Eff.snd.MSS = + + min(SendMSS+20, MMS_S) - TCPhdrsize - IPoptionsize + + where: + + * SendMSS is the MSS value received from the remote host, + or the default 536 if no MSS option is received. + + * MMS_S is the maximum size for a transport-layer message + that TCP may send. + + * TCPhdrsize is the size of the TCP header; this is + normally 20, but may be larger if TCP options are to be + sent. + + * IPoptionsize is the size of any IP options that TCP + will pass to the IP layer with the current message. + + + The MSS value to be sent in an MSS option must be less than + + + +Internet Engineering Task Force [Page 85] + + + + +RFC1122 TRANSPORT LAYER -- TCP October 1989 + + + or equal to: + + MMS_R - 20 + + where MMS_R is the maximum size for a transport-layer + message that can be received (and reassembled). TCP obtains + MMS_R and MMS_S from the IP layer; see the generic call + GET_MAXSIZES in Section 3.4. + + DISCUSSION: + The choice of TCP segment size has a strong effect on + performance. Larger segments increase throughput by + amortizing header size and per-datagram processing + overhead over more data bytes; however, if the packet + is so large that it causes IP fragmentation, efficiency + drops sharply if any fragments are lost [IP:9]. + + Some TCP implementations send an MSS option only if the + destination host is on a non-connected network. + However, in general the TCP layer may not have the + appropriate information to make this decision, so it is + preferable to leave to the IP layer the task of + determining a suitable MTU for the Internet path. We + therefore recommend that TCP always send the option (if + not 536) and that the IP layer determine MMS_R as + specified in 3.3.3 and 3.4. 
A proposed IP-layer + mechanism to measure the MTU would then modify the IP + layer without changing TCP. + + 4.2.2.7 TCP Checksum: RFC-793 Section 3.1 + + Unlike the UDP checksum (see Section 4.1.3.4), the TCP + checksum is never optional. The sender MUST generate it and + the receiver MUST check it. + + 4.2.2.8 TCP Connection State Diagram: RFC-793 Section 3.2, + page 23 + + There are several problems with this diagram: + + (a) The arrow from SYN-SENT to SYN-RCVD should be labeled + with "snd SYN,ACK", to agree with the text on page 68 + and with Figure 8. + + (b) There could be an arrow from SYN-RCVD state to LISTEN + state, conditioned on receiving a RST after a passive + open (see text page 70). + + + + +Internet Engineering Task Force [Page 86] + + + + +RFC1122 TRANSPORT LAYER -- TCP October 1989 + + + (c) It is possible to go directly from FIN-WAIT-1 to the + TIME-WAIT state (see page 75 of the spec). + + + 4.2.2.9 Initial Sequence Number Selection: RFC-793 Section + 3.3, page 27 + + A TCP MUST use the specified clock-driven selection of + initial sequence numbers. + + 4.2.2.10 Simultaneous Open Attempts: RFC-793 Section 3.4, page + 32 + + There is an error in Figure 8: the packet on line 7 should + be identical to the packet on line 5. + + A TCP MUST support simultaneous open attempts. + + DISCUSSION: + It sometimes surprises implementors that if two + applications attempt to simultaneously connect to each + other, only one connection is generated instead of two. + This was an intentional design decision; don't try to + "fix" it. + + 4.2.2.11 Recovery from Old Duplicate SYN: RFC-793 Section 3.4, + page 33 + + Note that a TCP implementation MUST keep track of whether a + connection has reached SYN_RCVD state as the result of a + passive OPEN or an active OPEN. + + 4.2.2.12 RST Segment: RFC-793 Section 3.4 + + A TCP SHOULD allow a received RST segment to include data. 
+ + DISCUSSION: + It has been suggested that a RST segment could contain + ASCII text that encoded and explained the cause of the + RST. No standard has yet been established for such + data. + + 4.2.2.13 Closing a Connection: RFC-793 Section 3.5 + + A TCP connection may terminate in two ways: (1) the normal + TCP close sequence using a FIN handshake, and (2) an "abort" + in which one or more RST segments are sent and the + connection state is immediately discarded. If a TCP + + + +Internet Engineering Task Force [Page 87] + + + + +RFC1122 TRANSPORT LAYER -- TCP October 1989 + + + connection is closed by the remote site, the local + application MUST be informed whether it closed normally or + was aborted. + + The normal TCP close sequence delivers buffered data + reliably in both directions. Since the two directions of a + TCP connection are closed independently, it is possible for + a connection to be "half closed," i.e., closed in only one + direction, and a host is permitted to continue sending data + in the open direction on a half-closed connection. + + A host MAY implement a "half-duplex" TCP close sequence, so + that an application that has called CLOSE cannot continue to + read data from the connection. If such a host issues a + CLOSE call while received data is still pending in TCP, or + if new data is received after CLOSE is called, its TCP + SHOULD send a RST to show that data was lost. + + When a connection is closed actively, it MUST linger in + TIME-WAIT state for a time 2xMSL (Maximum Segment Lifetime). + However, it MAY accept a new SYN from the remote TCP to + reopen the connection directly from TIME-WAIT state, if it: + + (1) assigns its initial sequence number for the new + connection to be larger than the largest sequence + number it used on the previous connection incarnation, + and + + (2) returns to TIME-WAIT state if the SYN turns out to be + an old duplicate. 
+ + + DISCUSSION: + TCP's full-duplex data-preserving close is a feature + that is not included in the analogous ISO transport + protocol TP4. + + Some systems have not implemented half-closed + connections, presumably because they do not fit into + the I/O model of their particular operating system. On + these systems, once an application has called CLOSE, it + can no longer read input data from the connection; this + is referred to as a "half-duplex" TCP close sequence. + + The graceful close algorithm of TCP requires that the + connection state remain defined on (at least) one end + of the connection, for a timeout period of 2xMSL, i.e., + 4 minutes. During this period, the (remote socket, + + + +Internet Engineering Task Force [Page 88] + + + + +RFC1122 TRANSPORT LAYER -- TCP October 1989 + + + local socket) pair that defines the connection is busy + and cannot be reused. To shorten the time that a given + port pair is tied up, some TCPs allow a new SYN to be + accepted in TIME-WAIT state. + + 4.2.2.14 Data Communication: RFC-793 Section 3.7, page 40 + + Since RFC-793 was written, there has been extensive work on + TCP algorithms to achieve efficient data communication. + Later sections of the present document describe required and + recommended TCP algorithms to determine when to send data + (Section 4.2.3.4), when to send an acknowledgment (Section + 4.2.3.2), and when to update the window (Section 4.2.3.3). + + DISCUSSION: + One important performance issue is "Silly Window + Syndrome" or "SWS" [TCP:5], a stable pattern of small + incremental window movements resulting in extremely + poor TCP performance. Algorithms to avoid SWS are + described below for both the sending side (Section + 4.2.3.4) and the receiving side (Section 4.2.3.3). 
+ + In brief, SWS is caused by the receiver advancing the + right window edge whenever it has any new buffer space + available to receive data and by the sender using any + incremental window, no matter how small, to send more + data [TCP:5]. The result can be a stable pattern of + sending tiny data segments, even though both sender and + receiver have a large total buffer space for the + connection. SWS can only occur during the transmission + of a large amount of data; if the connection goes + quiescent, the problem will disappear. It is caused by + typical straightforward implementation of window + management, but the sender and receiver algorithms + given below will avoid it. + + Another important TCP performance issue is that some + applications, especially remote login to character-at- + a-time hosts, tend to send streams of one-octet data + segments. To avoid deadlocks, every TCP SEND call from + such applications must be "pushed", either explicitly + by the application or else implicitly by TCP. The + result may be a stream of TCP segments that contain one + data octet each, which makes very inefficient use of + the Internet and contributes to Internet congestion. + The Nagle Algorithm described in Section 4.2.3.4 + provides a simple and effective solution to this + problem. It does have the effect of clumping + + + +Internet Engineering Task Force [Page 89] + + + + +RFC1122 TRANSPORT LAYER -- TCP October 1989 + + + characters over Telnet connections; this may initially + surprise users accustomed to single-character echo, but + user acceptance has not been a problem. + + Note that the Nagle algorithm and the send SWS + avoidance algorithm play complementary roles in + improving performance. The Nagle algorithm discourages + sending tiny segments when the data to be sent + increases in small increments, while the SWS avoidance + algorithm discourages small segments resulting from the + right window edge advancing in small increments. 
+ + A careless implementation can send two or more + acknowledgment segments per data segment received. For + example, suppose the receiver acknowledges every data + segment immediately. When the application program + subsequently consumes the data and increases the + available receive buffer space again, the receiver may + send a second acknowledgment segment to update the + window at the sender. The extreme case occurs with + single-character segments on TCP connections using the + Telnet protocol for remote login service. Some + implementations have been observed in which each + incoming 1-character segment generates three return + segments: (1) the acknowledgment, (2) a one byte + increase in the window, and (3) the echoed character, + respectively. + + 4.2.2.15 Retransmission Timeout: RFC-793 Section 3.7, page 41 + + The algorithm suggested in RFC-793 for calculating the + retransmission timeout is now known to be inadequate; see + Section 4.2.3.1 below. + + Recent work by Jacobson [TCP:7] on Internet congestion and + TCP retransmission stability has produced a transmission + algorithm combining "slow start" with "congestion + avoidance". A TCP MUST implement this algorithm. + + If a retransmitted packet is identical to the original + packet (which implies not only that the data boundaries have + not changed, but also that the window and acknowledgment + fields of the header have not changed), then the same IP + Identification field MAY be used (see Section 3.2.1.5). + + IMPLEMENTATION: + Some TCP implementors have chosen to "packetize" the + data stream, i.e., to pick segment boundaries when + + + +Internet Engineering Task Force [Page 90] + + + + +RFC1122 TRANSPORT LAYER -- TCP October 1989 + + + segments are originally sent and to queue these + segments in a "retransmission queue" until they are + acknowledged. 
Another design (which may be simpler) is + to defer packetizing until each time data is + transmitted or retransmitted, so there will be no + segment retransmission queue. + + In an implementation with a segment retransmission + queue, TCP performance may be enhanced by repacketizing + the segments awaiting acknowledgment when the first + retransmission timeout occurs. That is, the + outstanding segments that fitted would be combined into + one maximum-sized segment, with a new IP Identification + value. The TCP would then retain this combined segment + in the retransmit queue until it was acknowledged. + However, if the first two segments in the + retransmission queue totalled more than one maximum- + sized segment, the TCP would retransmit only the first + segment using the original IP Identification field. + + 4.2.2.16 Managing the Window: RFC-793 Section 3.7, page 41 + + A TCP receiver SHOULD NOT shrink the window, i.e., move the + right window edge to the left. However, a sending TCP MUST + be robust against window shrinking, which may cause the + "useable window" (see Section 4.2.3.4) to become negative. + + If this happens, the sender SHOULD NOT send new data, but + SHOULD retransmit normally the old unacknowledged data + between SND.UNA and SND.UNA+SND.WND. The sender MAY also + retransmit old data beyond SND.UNA+SND.WND, but SHOULD NOT + time out the connection if data beyond the right window edge + is not acknowledged. If the window shrinks to zero, the TCP + MUST probe it in the standard way (see next Section). + + DISCUSSION: + Many TCP implementations become confused if the window + shrinks from the right after data has been sent into a + larger window. Note that TCP has a heuristic to select + the latest window update despite possible datagram + reordering; as a result, it may ignore a window update + with a smaller window than previously offered if + neither the sequence number nor the acknowledgment + number is increased. 
+ + + + + + + +Internet Engineering Task Force [Page 91] + + + + +RFC1122 TRANSPORT LAYER -- TCP October 1989 + + + 4.2.2.17 Probing Zero Windows: RFC-793 Section 3.7, page 42 + + Probing of zero (offered) windows MUST be supported. + + A TCP MAY keep its offered receive window closed + indefinitely. As long as the receiving TCP continues to + send acknowledgments in response to the probe segments, the + sending TCP MUST allow the connection to stay open. + + DISCUSSION: + It is extremely important to remember that ACK + (acknowledgment) segments that contain no data are not + reliably transmitted by TCP. If zero window probing is + not supported, a connection may hang forever when an + ACK segment that re-opens the window is lost. + + The delay in opening a zero window generally occurs + when the receiving application stops taking data from + its TCP. For example, consider a printer daemon + application, stopped because the printer ran out of + paper. + + The transmitting host SHOULD send the first zero-window + probe when a zero window has existed for the retransmission + timeout period (see Section 4.2.2.15), and SHOULD increase + exponentially the interval between successive probes. + + DISCUSSION: + This procedure minimizes delay if the zero-window + condition is due to a lost ACK segment containing a + window-opening update. Exponential backoff is + recommended, possibly with some maximum interval not + specified here. This procedure is similar to that of + the retransmission algorithm, and it may be possible to + combine the two procedures in the implementation. + + 4.2.2.18 Passive OPEN Calls: RFC-793 Section 3.8 + + Every passive OPEN call either creates a new connection + record in LISTEN state, or it returns an error; it MUST NOT + affect any previously created connection record. 
+ + A TCP that supports multiple concurrent users MUST provide + an OPEN call that will functionally allow an application to + LISTEN on a port while a connection block with the same + local port is in SYN-SENT or SYN-RECEIVED state. + + DISCUSSION: + + + +Internet Engineering Task Force [Page 92] + + + + +RFC1122 TRANSPORT LAYER -- TCP October 1989 + + + Some applications (e.g., SMTP servers) may need to + handle multiple connection attempts at about the same + time. The probability of a connection attempt failing + is reduced by giving the application some means of + listening for a new connection at the same time that an + earlier connection attempt is going through the three- + way handshake. + + IMPLEMENTATION: + Acceptable implementations of concurrent opens may + permit multiple passive OPEN calls, or they may allow + "cloning" of LISTEN-state connections from a single + passive OPEN call. + + 4.2.2.19 Time to Live: RFC-793 Section 3.9, page 52 + + RFC-793 specified that TCP was to request the IP layer to + send TCP segments with TTL = 60. This is obsolete; the TTL + value used to send TCP segments MUST be configurable. See + Section 3.2.1.7 for discussion. + + 4.2.2.20 Event Processing: RFC-793 Section 3.9 + + While it is not strictly required, a TCP SHOULD be capable + of queueing out-of-order TCP segments. Change the "may" in + the last sentence of the first paragraph on page 70 to + "should". + + DISCUSSION: + Some small-host implementations have omitted segment + queueing because of limited buffer space. This + omission may be expected to adversely affect TCP + throughput, since loss of a single segment causes all + later segments to appear to be "out of sequence". + + In general, the processing of received segments MUST be + implemented to aggregate ACK segments whenever possible. + For example, if the TCP is processing a series of queued + segments, it MUST process them all before sending any ACK + segments. 
+ + Here are some detailed error corrections and notes on the + Event Processing section of RFC-793. + + (a) CLOSE Call, CLOSE-WAIT state, p. 61: enter LAST-ACK + state, not CLOSING. + + (b) LISTEN state, check for SYN (pp. 65, 66): With a SYN + + + +Internet Engineering Task Force [Page 93] + + + + +RFC1122 TRANSPORT LAYER -- TCP October 1989 + + + bit, if the security/compartment or the precedence is + wrong for the segment, a reset is sent. The wrong form + of reset is shown in the text; it should be: + + + + + (c) SYN-SENT state, Check for SYN, p. 68: When the + connection enters ESTABLISHED state, the following + variables must be set: + SND.WND <- SEG.WND + SND.WL1 <- SEG.SEQ + SND.WL2 <- SEG.ACK + + + (d) Check security and precedence, p. 71: The first heading + "ESTABLISHED STATE" should really be a list of all + states other than SYN-RECEIVED: ESTABLISHED, FIN-WAIT- + 1, FIN-WAIT-2, CLOSE-WAIT, CLOSING, LAST-ACK, and + TIME-WAIT. + + (e) Check SYN bit, p. 71: "In SYN-RECEIVED state and if + the connection was initiated with a passive OPEN, then + return this connection to the LISTEN state and return. + Otherwise...". + + (f) Check ACK field, SYN-RECEIVED state, p. 72: When the + connection enters ESTABLISHED state, the variables + listed in (c) must be set. + + (g) Check ACK field, ESTABLISHED state, p. 72: The ACK is a + duplicate if SEG.ACK =< SND.UNA (the = was omitted). + Similarly, the window should be updated if: SND.UNA =< + SEG.ACK =< SND.NXT. + + (h) USER TIMEOUT, p. 77: + + It would be better to notify the application of the + timeout rather than letting TCP force the connection + closed. However, see also Section 4.2.3.5. + + + 4.2.2.21 Acknowledging Queued Segments: RFC-793 Section 3.9 + + A TCP MAY send an ACK segment acknowledging RCV.NXT when a + valid segment arrives that is in the window but not at the + left window edge. 
+ + + + +Internet Engineering Task Force [Page 94] + + + + +RFC1122 TRANSPORT LAYER -- TCP October 1989 + + + DISCUSSION: + RFC-793 (see page 74) was ambiguous about whether or + not an ACK segment should be sent when an out-of-order + segment was received, i.e., when SEG.SEQ was unequal to + RCV.NXT. + + One reason for ACKing out-of-order segments might be to + support an experimental algorithm known as "fast + retransmit". With this algorithm, the sender uses the + "redundant" ACK's to deduce that a segment has been + lost before the retransmission timer has expired. It + counts the number of times an ACK has been received + with the same value of SEG.ACK and with the same right + window edge. If more than a threshold number of such + ACK's is received, then the segment containing the + octets starting at SEG.ACK is assumed to have been lost + and is retransmitted, without awaiting a timeout. The + threshold is chosen to compensate for the maximum + likely segment reordering in the Internet. There is + not yet enough experience with the fast retransmit + algorithm to determine how useful it is. + + 4.2.3 SPECIFIC ISSUES + + 4.2.3.1 Retransmission Timeout Calculation + + A host TCP MUST implement Karn's algorithm and Jacobson's + algorithm for computing the retransmission timeout ("RTO"). + + o Jacobson's algorithm for computing the smoothed round- + trip ("RTT") time incorporates a simple measure of the + variance [TCP:7]. + + o Karn's algorithm for selecting RTT measurements ensures + that ambiguous round-trip times will not corrupt the + calculation of the smoothed round-trip time [TCP:6]. + + This implementation also MUST include "exponential backoff" + for successive RTO values for the same segment. + Retransmission of SYN segments SHOULD use the same algorithm + as data segments. + + DISCUSSION: + There were two known problems with the RTO calculations + specified in RFC-793. 
First, the accurate measurement + of RTTs is difficult when there are retransmissions. + Second, the algorithm to compute the smoothed round- + trip time is inadequate [TCP:7], because it incorrectly + + + +Internet Engineering Task Force [Page 95] + + + + +RFC1122 TRANSPORT LAYER -- TCP October 1989 + + + assumed that the variance in RTT values would be small + and constant. These problems were solved by Karn's and + Jacobson's algorithm, respectively. + + The performance increase resulting from the use of + these improvements varies from noticeable to dramatic. + Jacobson's algorithm for incorporating the measured RTT + variance is especially important on a low-speed link, + where the natural variation of packet sizes causes a + large variation in RTT. One vendor found link + utilization on a 9.6kb line went from 10% to 90% as a + result of implementing Jacobson's variance algorithm in + TCP. + + The following values SHOULD be used to initialize the + estimation parameters for a new connection: + + (a) RTT = 0 seconds. + + (b) RTO = 3 seconds. (The smoothed variance is to be + initialized to the value that will result in this RTO). + + The recommended upper and lower bounds on the RTO are known + to be inadequate on large internets. The lower bound SHOULD + be measured in fractions of a second (to accommodate high + speed LANs) and the upper bound should be 2*MSL, i.e., 240 + seconds. + + DISCUSSION: + Experience has shown that these initialization values + are reasonable, and that in any case the Karn and + Jacobson algorithms make TCP behavior reasonably + insensitive to the initial parameter choices. + + 4.2.3.2 When to Send an ACK Segment + + A host that is receiving a stream of TCP data segments can + increase efficiency in both the Internet and the hosts by + sending fewer than one ACK (acknowledgment) segment per data + segment received; this is known as a "delayed ACK" [TCP:5]. 
+ + A TCP SHOULD implement a delayed ACK, but an ACK should not + be excessively delayed; in particular, the delay MUST be + less than 0.5 seconds, and in a stream of full-sized + segments there SHOULD be an ACK for at least every second + segment. + + DISCUSSION: + + + +Internet Engineering Task Force [Page 96] + + + + +RFC1122 TRANSPORT LAYER -- TCP October 1989 + + + A delayed ACK gives the application an opportunity to + update the window and perhaps to send an immediate + response. In particular, in the case of character-mode + remote login, a delayed ACK can reduce the number of + segments sent by the server by a factor of 3 (ACK, + window update, and echo character all combined in one + segment). + + In addition, on some large multi-user hosts, a delayed + ACK can substantially reduce protocol processing + overhead by reducing the total number of packets to be + processed [TCP:5]. However, excessive delays on ACK's + can disturb the round-trip timing and packet "clocking" + algorithms [TCP:7]. + + 4.2.3.3 When to Send a Window Update + + A TCP MUST include a SWS avoidance algorithm in the receiver + [TCP:5]. + + IMPLEMENTATION: + The receiver's SWS avoidance algorithm determines when + the right window edge may be advanced; this is + customarily known as "updating the window". This + algorithm combines with the delayed ACK algorithm (see + Section 4.2.3.2) to determine when an ACK segment + containing the current window will really be sent to + the receiver. We use the notation of RFC-793; see + Figures 4 and 5 in that document. + + The solution to receiver SWS is to avoid advancing the + right window edge RCV.NXT+RCV.WND in small increments, + even if data is received from the network in small + segments. + + Suppose the total receive buffer space is RCV.BUFF. At + any given moment, RCV.USER octets of this total may be + tied up with data that has been received and + acknowledged but which the user process has not yet + consumed. 
When the connection is quiescent, RCV.WND = + RCV.BUFF and RCV.USER = 0. + + Keeping the right window edge fixed as data arrives and + is acknowledged requires that the receiver offer less + than its full buffer space, i.e., the receiver must + specify a RCV.WND that keeps RCV.NXT+RCV.WND constant + as RCV.NXT increases. Thus, the total buffer space + RCV.BUFF is generally divided into three parts: + + + +Internet Engineering Task Force [Page 97] + + + + +RFC1122 TRANSPORT LAYER -- TCP October 1989 + + + + |<------- RCV.BUFF ---------------->| + 1 2 3 + ----|---------|------------------|------|---- + RCV.NXT ^ + (Fixed) + + 1 - RCV.USER = data received but not yet consumed; + 2 - RCV.WND = space advertised to sender; + 3 - Reduction = space available but not yet + advertised. + + + The suggested SWS avoidance algorithm for the receiver + is to keep RCV.NXT+RCV.WND fixed until the reduction + satisfies: + + RCV.BUFF - RCV.USER - RCV.WND >= + + min( Fr * RCV.BUFF, Eff.snd.MSS ) + + where Fr is a fraction whose recommended value is 1/2, + and Eff.snd.MSS is the effective send MSS for the + connection (see Section 4.2.2.6). When the inequality + is satisfied, RCV.WND is set to RCV.BUFF-RCV.USER. + + Note that the general effect of this algorithm is to + advance RCV.WND in increments of Eff.snd.MSS (for + realistic receive buffers: Eff.snd.MSS < RCV.BUFF/2). + Note also that the receiver must use its own + Eff.snd.MSS, assuming it is the same as the sender's. + + 4.2.3.4 When to Send Data + + A TCP MUST include a SWS avoidance algorithm in the sender. + + A TCP SHOULD implement the Nagle Algorithm [TCP:9] to + coalesce short segments. However, there MUST be a way for + an application to disable the Nagle algorithm on an + individual connection. In all cases, sending data is also + subject to the limitation imposed by the Slow Start + algorithm (Section 4.2.2.15). 
+ + DISCUSSION: + The Nagle algorithm is generally as follows: + + If there is unacknowledged data (i.e., SND.NXT > + SND.UNA), then the sending TCP buffers all user + + + +Internet Engineering Task Force [Page 98] + + + + +RFC1122 TRANSPORT LAYER -- TCP October 1989 + + + data (regardless of the PSH bit), until the + outstanding data has been acknowledged or until + the TCP can send a full-sized segment (Eff.snd.MSS + bytes; see Section 4.2.2.6). + + Some applications (e.g., real-time display window + updates) require that the Nagle algorithm be turned + off, so small data segments can be streamed out at the + maximum rate. + + IMPLEMENTATION: + The sender's SWS avoidance algorithm is more difficult + than the receiver's, because the sender does not know + (directly) the receiver's total buffer space RCV.BUFF. + An approach which has been found to work well is for + the sender to calculate Max(SND.WND), the maximum send + window it has seen so far on the connection, and to use + this value as an estimate of RCV.BUFF. Unfortunately, + this can only be an estimate; the receiver may at any + time reduce the size of RCV.BUFF. To avoid a resulting + deadlock, it is necessary to have a timeout to force + transmission of data, overriding the SWS avoidance + algorithm. In practice, this timeout should seldom + occur. + + The "useable window" [TCP:5] is: + + U = SND.UNA + SND.WND - SND.NXT + + i.e., the offered window less the amount of data sent + but not acknowledged. If D is the amount of data + queued in the sending TCP but not yet sent, then the + following set of rules is recommended.
+ + Send data: + + (1) if a maximum-sized segment can be sent, i.e., if: + + min(D,U) >= Eff.snd.MSS; + + + (2) or if the data is pushed and all queued data can + be sent now, i.e., if: + + [SND.NXT = SND.UNA and] PUSHED and D <= U + + (the bracketed condition is imposed by the Nagle + algorithm); + + + +Internet Engineering Task Force [Page 99] + + + + +RFC1122 TRANSPORT LAYER -- TCP October 1989 + + + (3) or if at least a fraction Fs of the maximum window + can be sent, i.e., if: + + [SND.NXT = SND.UNA and] + + min(D,U) >= Fs * Max(SND.WND); + + + (4) or if data is PUSHed and the override timeout + occurs. + + Here Fs is a fraction whose recommended value is 1/2. + The override timeout should be in the range 0.1 - 1.0 + seconds. It may be convenient to combine this timer + with the timer used to probe zero windows (Section + 4.2.2.17). + + Finally, note that the SWS avoidance algorithm just + specified is to be used instead of the sender-side + algorithm contained in [TCP:5]. + + 4.2.3.5 TCP Connection Failures + + Excessive retransmission of the same segment by TCP + indicates some failure of the remote host or the Internet + path. This failure may be of short or long duration. The + following procedure MUST be used to handle excessive + retransmissions of data segments [IP:11]: + + (a) There are two thresholds R1 and R2 measuring the amount + of retransmission that has occurred for the same + segment. R1 and R2 might be measured in time units or + as a count of retransmissions. + + (b) When the number of transmissions of the same segment + reaches or exceeds threshold R1, pass negative advice + (see Section 3.3.1.4) to the IP layer, to trigger + dead-gateway diagnosis. + + (c) When the number of transmissions of the same segment + reaches a threshold R2 greater than R1, close the + connection. + + (d) An application MUST be able to set the value for R2 for + a particular connection.
For example, an interactive + application might set R2 to "infinity," giving the user + control over when to disconnect. + + + + +Internet Engineering Task Force [Page 100] + + + + +RFC1122 TRANSPORT LAYER -- TCP October 1989 + + + (e) TCP SHOULD inform the application of the delivery + problem (unless such information has been disabled by + the application; see Section 4.2.4.1), when R1 is + reached and before R2. This will allow a remote login + (User Telnet) application program to inform the user, + for example. + + The value of R1 SHOULD correspond to at least 3 + retransmissions, at the current RTO. The value of R2 SHOULD + correspond to at least 100 seconds. + + An attempt to open a TCP connection could fail with + excessive retransmissions of the SYN segment or by receipt + of a RST segment or an ICMP Port Unreachable. SYN + retransmissions MUST be handled in the general way just + described for data retransmissions, including notification + of the application layer. + + However, the values of R1 and R2 may be different for SYN + and data segments. In particular, R2 for a SYN segment MUST + be set large enough to provide retransmission of the segment + for at least 3 minutes. The application can close the + connection (i.e., give up on the open attempt) sooner, of + course. + + DISCUSSION: + Some Internet paths have significant setup times, and + the number of such paths is likely to increase in the + future. + + 4.2.3.6 TCP Keep-Alives + + Implementors MAY include "keep-alives" in their TCP + implementations, although this practice is not universally + accepted. If keep-alives are included, the application MUST + be able to turn them on or off for each TCP connection, and + they MUST default to off. + + Keep-alive packets MUST only be sent when no data or + acknowledgement packets have been received for the + connection within an interval. This interval MUST be + configurable and MUST default to no less than two hours.
+ + It is extremely important to remember that ACK segments that + contain no data are not reliably transmitted by TCP. + Consequently, if a keep-alive mechanism is implemented it + MUST NOT interpret failure to respond to any specific probe + as a dead connection. + + + +Internet Engineering Task Force [Page 101] + + + + +RFC1122 TRANSPORT LAYER -- TCP October 1989 + + + An implementation SHOULD send a keep-alive segment with no + data; however, it MAY be configurable to send a keep-alive + segment containing one garbage octet, for compatibility with + erroneous TCP implementations. + + DISCUSSION: + A "keep-alive" mechanism periodically probes the other + end of a connection when the connection is otherwise + idle, even when there is no data to be sent. The TCP + specification does not include a keep-alive mechanism + because it could: (1) cause perfectly good connections + to break during transient Internet failures; (2) + consume unnecessary bandwidth ("if no one is using the + connection, who cares if it is still good?"); and (3) + cost money for an Internet path that charges for + packets. + + Some TCP implementations, however, have included a + keep-alive mechanism. To confirm that an idle + connection is still active, these implementations send + a probe segment designed to elicit a response from the + peer TCP. Such a segment generally contains SEG.SEQ = + SND.NXT-1 and may or may not contain one garbage octet + of data. Note that on a quiet connection SND.NXT = + RCV.NXT, so that this SEG.SEQ will be outside the + window. Therefore, the probe causes the receiver to + return an acknowledgment segment, confirming that the + connection is still live. If the peer has dropped the + connection due to a network partition or a crash, it + will respond with a RST instead of an acknowledgment + segment. + + Unfortunately, some misbehaved TCP implementations fail + to respond to a segment with SEG.SEQ = SND.NXT-1 unless + the segment contains data. 
Alternatively, an + implementation could determine whether a peer responded + correctly to keep-alive packets with no garbage data + octet. + + A TCP keep-alive mechanism should only be invoked in + server applications that might otherwise hang + indefinitely and consume resources unnecessarily if a + client crashes or aborts a connection during a network + failure. + + + + + + + +Internet Engineering Task Force [Page 102] + + + + +RFC1122 TRANSPORT LAYER -- TCP October 1989 + + + 4.2.3.7 TCP Multihoming + + If an application on a multihomed host does not specify the + local IP address when actively opening a TCP connection, + then the TCP MUST ask the IP layer to select a local IP + address before sending the (first) SYN. See the function + GET_SRCADDR() in Section 3.4. + + At all other times, a previous segment has either been sent + or received on this connection, and TCP MUST use the same + local address that was used in those previous + segments. + + 4.2.3.8 IP Options + + When received options are passed up to TCP from the IP + layer, TCP MUST ignore options that it does not understand. + + A TCP MAY support the Time Stamp and Record Route options. + + An application MUST be able to specify a source route when + it actively opens a TCP connection, and this MUST take + precedence over a source route received in a datagram. + + When a TCP connection is OPENed passively and a packet + arrives with a completed IP Source Route option (containing + a return route), TCP MUST save the return route and use it + for all segments sent on this connection. If a different + source route arrives in a later segment, the later + definition SHOULD override the earlier one. + + 4.2.3.9 ICMP Messages + + TCP MUST act on an ICMP error message passed up from the IP + layer, directing it to the connection that created the + error. The necessary demultiplexing information can be + found in the IP header contained within the ICMP message.
+ + o Source Quench + + TCP MUST react to a Source Quench by slowing + transmission on the connection. The RECOMMENDED + procedure is for a Source Quench to trigger a "slow + start," as if a retransmission timeout had occurred. + + o Destination Unreachable -- codes 0, 1, 5 + + Since these Unreachable messages indicate soft error + + + +Internet Engineering Task Force [Page 103] + + + + +RFC1122 TRANSPORT LAYER -- TCP October 1989 + + + conditions, TCP MUST NOT abort the connection, and it + SHOULD make the information available to the + application. + + DISCUSSION: + TCP could report the soft error condition directly + to the application layer with an upcall to the + ERROR_REPORT routine, or it could merely note the + message and report it to the application only when + and if the TCP connection times out. + + o Destination Unreachable -- codes 2-4 + + These are hard error conditions, so TCP SHOULD abort + the connection. + + o Time Exceeded -- codes 0, 1 + + This should be handled the same way as Destination + Unreachable codes 0, 1, 5 (see above). + + o Parameter Problem + + This should be handled the same way as Destination + Unreachable codes 0, 1, 5 (see above). + + + 4.2.3.10 Remote Address Validation + + A TCP implementation MUST reject as an error a local OPEN + call for an invalid remote IP address (e.g., a broadcast or + multicast address). + + An incoming SYN with an invalid source address must be + ignored either by TCP or by the IP layer (see Section + 3.2.1.3). + + A TCP implementation MUST silently discard an incoming SYN + segment that is addressed to a broadcast or multicast + address. 
+ + 4.2.3.11 TCP Traffic Patterns + + IMPLEMENTATION: + The TCP protocol specification [TCP:1] gives the + implementor much freedom in designing the algorithms + that control the message flow over the connection -- + packetizing, managing the window, sending + + + +Internet Engineering Task Force [Page 104] + + + + +RFC1122 TRANSPORT LAYER -- TCP October 1989 + + + acknowledgments, etc. These design decisions are + difficult because a TCP must adapt to a wide range of + traffic patterns. Experience has shown that a TCP + implementor needs to verify the design on two extreme + traffic patterns: + + o Single-character Segments + + Even if the sender is using the Nagle Algorithm, + when a TCP connection carries remote login traffic + across a low-delay LAN the receiver will generally + get a stream of single-character segments. If + remote terminal echo mode is in effect, the + receiver's system will generally echo each + character as it is received. + + o Bulk Transfer + + When TCP is used for bulk transfer, the data + stream should be made up (almost) entirely of + segments of the size of the effective MSS. + Although TCP uses a sequence number space with + byte (octet) granularity, in bulk-transfer mode + its operation should be as if TCP used a sequence + space that counted only segments. + + Experience has furthermore shown that a single TCP can + effectively and efficiently handle these two extremes. + + The most important tool for verifying a new TCP + implementation is a packet trace program. There is a + large volume of experience showing the importance of + tracing a variety of traffic patterns with other TCP + implementations and studying the results carefully. + + + 4.2.3.12 Efficiency + + IMPLEMENTATION: + Extensive experience has led to the following + suggestions for efficient implementation of TCP: + + (a) Don't Copy Data + + In bulk data transfer, the primary CPU-intensive + tasks are copying data from one place to another + and checksumming the data. 
It is vital to + minimize the number of copies of TCP data. Since + + + +Internet Engineering Task Force [Page 105] + + + + +RFC1122 TRANSPORT LAYER -- TCP October 1989 + + + the ultimate speed limitation may be fetching data + across the memory bus, it may be useful to combine + the copy with checksumming, doing both with a + single memory fetch. + + (b) Hand-Craft the Checksum Routine + + A good TCP checksumming routine is typically two + to five times faster than a simple and direct + implementation of the definition. Great care and + clever coding are often required and advisable to + make the checksumming code "blazing fast". See + [TCP:10]. + + (c) Code for the Common Case + + TCP protocol processing can be complicated, but + for most segments there are only a few simple + decisions to be made. Per-segment processing will + be greatly speeded up by coding the main line to + minimize the number of decisions in the most + common case. + + + 4.2.4 TCP/APPLICATION LAYER INTERFACE + + 4.2.4.1 Asynchronous Reports + + There MUST be a mechanism for reporting soft TCP error + conditions to the application. Generically, we assume this + takes the form of an application-supplied ERROR_REPORT + routine that may be upcalled [INTRO:7] asynchronously from + the transport layer: + + ERROR_REPORT(local connection name, reason, subreason) + + The precise encoding of the reason and subreason parameters + is not specified here. However, the conditions that are + reported asynchronously to the application MUST include: + + * ICMP error message arrived (see 4.2.3.9) + + * Excessive retransmissions (see 4.2.3.5) + + * Urgent pointer advance (see 4.2.2.4). + + However, an application program that does not want to + receive such ERROR_REPORT calls SHOULD be able to + + + +Internet Engineering Task Force [Page 106] + + + + +RFC1122 TRANSPORT LAYER -- TCP October 1989 + + + effectively disable these calls. 
+ + DISCUSSION: + These error reports generally reflect soft errors that + can be ignored without harm by many applications. It + has been suggested that these error report calls should + default to "disabled," but this is not required. + + 4.2.4.2 Type-of-Service + + The application layer MUST be able to specify the Type-of- + Service (TOS) for segments that are sent on a connection. + It is not required, but the application SHOULD be able to + change the TOS during the connection lifetime. TCP SHOULD + pass the current TOS value without change to the IP layer, + when it sends segments on the connection. + + The TOS will be specified independently in each direction on + the connection, so that the receiver application will + specify the TOS used for ACK segments. + + TCP MAY pass the most recently received TOS up to the + application. + + DISCUSSION + Some applications (e.g., SMTP) change the nature of + their communication during the lifetime of a + connection, and therefore would like to change the TOS + specification. + + Note also that the OPEN call specified in RFC-793 + includes a parameter ("options") in which the caller + can specify IP options such as source route, record + route, or timestamp. + + 4.2.4.3 Flush Call + + Some TCP implementations have included a FLUSH call, which + will empty the TCP send queue of any data for which the user + has issued SEND calls but which is still to the right of the + current send window. That is, it flushes as much queued + send data as possible without losing sequence number + synchronization. This is useful for implementing the "abort + output" function of Telnet. + + + + + + + +Internet Engineering Task Force [Page 107] + + + + +RFC1122 TRANSPORT LAYER -- TCP October 1989 + + + 4.2.4.4 Multihoming + + The user interface outlined in sections 2.7 and 3.8 of RFC- + 793 needs to be extended for multihoming. The OPEN call + MUST have an optional parameter: + + OPEN( ... [local IP address,] ... 
) + + to allow the specification of the local IP address. + + DISCUSSION: + Some TCP-based applications need to specify the local + IP address to be used to open a particular connection; + FTP is an example. + + IMPLEMENTATION: + A passive OPEN call with a specified "local IP address" + parameter will await an incoming connection request to + that address. If the parameter is unspecified, a + passive OPEN will await an incoming connection request + to any local IP address, and then bind the local IP + address of the connection to the particular address + that is used. + + For an active OPEN call, a specified "local IP address" + parameter will be used for opening the connection. If + the parameter is unspecified, the networking software + will choose an appropriate local IP address (see + Section 3.3.4.2) for the connection. + + 4.2.5 TCP REQUIREMENT SUMMARY + + | | | | |S| | + | | | | |H| |F + | | | | |O|M|o + | | |S| |U|U|o + | | |H| |L|S|t + | |M|O| |D|T|n + | |U|U|M| | |o + | |S|L|A|N|N|t + | |T|D|Y|O|O|t +FEATURE |SECTION | | | |T|T|e +-------------------------------------------------|--------|-|-|-|-|-|-- + | | | | | | | +Push flag | | | | | | | + Aggregate or queue un-pushed data |4.2.2.2 | | |x| | | + Sender collapse successive PSH flags |4.2.2.2 | |x| | | | + SEND call can specify PUSH |4.2.2.2 | | |x| | | + + + +Internet Engineering Task Force [Page 108] + + + + +RFC1122 TRANSPORT LAYER -- TCP October 1989 + + + If cannot: sender buffer indefinitely |4.2.2.2 | | | | |x| + If cannot: PSH last segment |4.2.2.2 |x| | | | | + Notify receiving ALP of PSH |4.2.2.2 | | |x| | |1 + Send max size segment when possible |4.2.2.2 | |x| | | | + | | | | | | | +Window | | | | | | | + Treat as unsigned number |4.2.2.3 |x| | | | | + Handle as 32-bit number |4.2.2.3 | |x| | | | + Shrink window from right |4.2.2.16| | | |x| | + Robust against shrinking window |4.2.2.16|x| | | | | + Receiver's window closed indefinitely |4.2.2.17| | |x| | | + Sender probe zero window 
|4.2.2.17|x| | | | | + First probe after RTO |4.2.2.17| |x| | | | + Exponential backoff |4.2.2.17| |x| | | | + Allow window stay zero indefinitely |4.2.2.17|x| | | | | + Sender timeout OK conn with zero wind |4.2.2.17| | | | |x| + | | | | | | | +Urgent Data | | | | | | | + Pointer points to last octet |4.2.2.4 |x| | | | | + Arbitrary length urgent data sequence |4.2.2.4 |x| | | | | + Inform ALP asynchronously of urgent data |4.2.2.4 |x| | | | |1 + ALP can learn if/how much urgent data Q'd |4.2.2.4 |x| | | | |1 + | | | | | | | +TCP Options | | | | | | | + Receive TCP option in any segment |4.2.2.5 |x| | | | | + Ignore unsupported options |4.2.2.5 |x| | | | | + Cope with illegal option length |4.2.2.5 |x| | | | | + Implement sending & receiving MSS option |4.2.2.6 |x| | | | | + Send MSS option unless 536 |4.2.2.6 | |x| | | | + Send MSS option always |4.2.2.6 | | |x| | | + Send-MSS default is 536 |4.2.2.6 |x| | | | | + Calculate effective send seg size |4.2.2.6 |x| | | | | + | | | | | | | +TCP Checksums | | | | | | | + Sender compute checksum |4.2.2.7 |x| | | | | + Receiver check checksum |4.2.2.7 |x| | | | | + | | | | | | | +Use clock-driven ISN selection |4.2.2.9 |x| | | | | + | | | | | | | +Opening Connections | | | | | | | + Support simultaneous open attempts |4.2.2.10|x| | | | | + SYN-RCVD remembers last state |4.2.2.11|x| | | | | + Passive Open call interfere with others |4.2.2.18| | | | |x| + Function: simultan. LISTENs for same port |4.2.2.18|x| | | | | + Ask IP for src address for SYN if necc. |4.2.3.7 |x| | | | | + Otherwise, use local addr of conn. 
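|4.2.3.7 |x| | | | | + OPEN to broadcast/multicast IP Address |4.2.3.14| | | | |x| + Silently discard seg to bcast/mcast addr |4.2.3.14|x| | | | | + + + +Internet Engineering Task Force [Page 109] + + + + +RFC1122 TRANSPORT LAYER -- TCP October 1989 + + + | | | | | | | +Closing Connections | | | | | | | + RST can contain data |4.2.2.12| |x| | | | + Inform application of aborted conn |4.2.2.13|x| | | | | + Half-duplex close connections |4.2.2.13| | |x| | | + Send RST to indicate data lost |4.2.2.13| |x| | | | + In TIME-WAIT state for 2xMSL seconds |4.2.2.13|x| | | | | + Accept SYN from TIME-WAIT state |4.2.2.13| | |x| | | + | | | | | | | +Retransmissions | | | | | | | + Jacobson Slow Start algorithm |4.2.2.15|x| | | | | + Jacobson Congestion-Avoidance algorithm |4.2.2.15|x| | | | | + Retransmit with same IP ident |4.2.2.15| | |x| | | + Karn's algorithm |4.2.3.1 |x| | | | | + Jacobson's RTO estimation alg. |4.2.3.1 |x| | | | | + Exponential backoff |4.2.3.1 |x| | | | | + SYN RTO calc same as data |4.2.3.1 | |x| | | | + Recommended initial values and bounds |4.2.3.1 | |x| | | | + | | | | | | | +Generating ACK's: | | | | | | | + Queue out-of-order segments |4.2.2.20| |x| | | | + Process all Q'd before send ACK |4.2.2.20|x| | | | | + Send ACK for out-of-order segment |4.2.2.21| | |x| | | + Delayed ACK's |4.2.3.2 | |x| | | | + Delay < 0.5 seconds |4.2.3.2 |x| | | | | + Every 2nd full-sized segment ACK'd |4.2.3.2 |x| | | | | + Receiver SWS-Avoidance Algorithm |4.2.3.3 |x| | | | | + | | | | | | | +Sending data | | | | | | | + Configurable TTL |4.2.2.19|x| | | | | + Sender SWS-Avoidance Algorithm |4.2.3.4 |x| | | | | + Nagle algorithm |4.2.3.4 | |x| | | | + Application can disable Nagle algorithm |4.2.3.4 |x| | | | | + | | | | | | | +Connection Failures: | | | | | | | + Negative advice to IP on R1 retxs |4.2.3.5 |x| | | | | + Close connection on R2 retxs |4.2.3.5 |x| | | | | + ALP can set R2 |4.2.3.5 |x| | | | |1 + Inform ALP of R1<=retxs<R2 |4.2.3.5 | |x| | | | + Recommended values for R1, R2 |4.2.3.5 | |x| | | | + Same mechanism for SYNs |4.2.3.5 |x| | | | | + R2 at least 3 minutes for SYN |4.2.3.5 |x| | | | | + | | | | | | | +Send Keep-alive Packets: |4.2.3.6 | | |x| | | + - Application can request |4.2.3.6 |x| | | | | + - Default is "off" |4.2.3.6 |x| | | | | + - Only send if idle for interval |4.2.3.6 |x| | | | | + - Interval configurable |4.2.3.6 |x| | | | | + + + +Internet Engineering Task Force [Page 110] + + + + +RFC1122 TRANSPORT LAYER -- TCP October 1989 + + + - Default at least 2 hrs. |4.2.3.6 |x| | | | | + - Tolerant of lost ACK's |4.2.3.6 |x| | | | | + | | | | | | | +IP Options | | | | | | | + Ignore options TCP doesn't understand |4.2.3.8 |x| | | | | + Time Stamp support |4.2.3.8 | | |x| | | + Record Route support |4.2.3.8 | | |x| | | + Source Route: | | | | | | | + ALP can specify |4.2.3.8 |x| | | | |1 + Overrides src rt in datagram |4.2.3.8 |x| | | | | + Build return route from src rt |4.2.3.8 |x| | | | | + Later src route overrides |4.2.3.8 | |x| | | | + | | | | | | | +Receiving ICMP Messages from IP |4.2.3.9 |x| | | | | + Dest. Unreach (0,1,5) => inform ALP |4.2.3.9 | |x| | | | + 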
Dest. Unreach (0,1,5) => abort conn |4.2.3.9 | | | | |x| + Dest. Unreach (2-4) => abort conn |4.2.3.9 | |x| | | | + Source Quench => slow start |4.2.3.9 | |x| | | | + Time Exceeded => tell ALP, don't abort |4.2.3.9 | |x| | | | + Param Problem => tell ALP, don't abort |4.2.3.9 | |x| | | | + | | | | | | | +Address Validation | | | | | | | + Reject OPEN call to invalid IP address |4.2.3.10|x| | | | | + Reject SYN from invalid IP address |4.2.3.10|x| | | | | + Silently discard SYN to bcast/mcast addr |4.2.3.10|x| | | | | + | | | | | | | +TCP/ALP Interface Services | | | | | | | + Error Report mechanism |4.2.4.1 |x| | | | | + ALP can disable Error Report Routine |4.2.4.1 | |x| | | | + ALP can specify TOS for sending |4.2.4.2 |x| | | | | + Passed unchanged to IP |4.2.4.2 | |x| | | | + ALP can change TOS during connection |4.2.4.2 | |x| | | | + Pass received TOS up to ALP |4.2.4.2 | | |x| | | + FLUSH call |4.2.4.3 | | |x| | | + Optional local IP addr parm. in OPEN |4.2.4.4 |x| | | | | +-------------------------------------------------|--------|-|-|-|-|-|-- +-------------------------------------------------|--------|-|-|-|-|-|-- + +FOOTNOTES: + +(1) "ALP" means Application-Layer program. + + + + + + + + + + +Internet Engineering Task Force [Page 111] + + + + +RFC1122 TRANSPORT LAYER -- TCP October 1989 + + +5. REFERENCES + +INTRODUCTORY REFERENCES + + +[INTRO:1] "Requirements for Internet Hosts -- Application and Support," + IETF Host Requirements Working Group, R. Braden, Ed., RFC-1123, + October 1989. + +[INTRO:2] "Requirements for Internet Gateways," R. Braden and J. + Postel, RFC-1009, June 1987. + +[INTRO:3] "DDN Protocol Handbook," NIC-50004, NIC-50005, NIC-50006, + (three volumes), SRI International, December 1985. + +[INTRO:4] "Official Internet Protocols," J. Reynolds and J. Postel, + RFC-1011, May 1987. + + This document is republished periodically with new RFC numbers; the + latest version must be used. + +[INTRO:5] "Protocol Document Order Information," O. 
Jacobsen and J. + Postel, RFC-980, March 1986. + +[INTRO:6] "Assigned Numbers," J. Reynolds and J. Postel, RFC-1010, May + 1987. + + This document is republished periodically with new RFC numbers; the + latest version must be used. + +[INTRO:7] "Modularity and Efficiency in Protocol Implementations," D. + Clark, RFC-817, July 1982. + +[INTRO:8] "The Structuring of Systems Using Upcalls," D. Clark, 10th ACM + SOSP, Orcas Island, Washington, December 1985. + + +Secondary References: + + +[INTRO:9] "A Protocol for Packet Network Intercommunication," V. Cerf + and R. Kahn, IEEE Transactions on Communication, May 1974. + +[INTRO:10] "The ARPA Internet Protocol," J. Postel, C. Sunshine, and D. + Cohen, Computer Networks, Vol. 5, No. 4, July 1981. + +[INTRO:11] "The DARPA Internet Protocol Suite," B. Leiner, J. Postel, + R. Cole and D. Mills, Proceedings INFOCOM 85, IEEE, Washington DC, + + + +Internet Engineering Task Force [Page 112] + + + + +RFC1122 TRANSPORT LAYER -- TCP October 1989 + + + March 1985. Also in: IEEE Communications Magazine, March 1985. + Also available as ISI-RS-85-153. + +[INTRO:12] "Final Text of DIS8473, Protocol for Providing the + Connectionless Mode Network Service," ANSI, published as RFC-994, + March 1986. + +[INTRO:13] "End System to Intermediate System Routing Exchange + Protocol," ANSI X3S3.3, published as RFC-995, April 1986. + + +LINK LAYER REFERENCES + + +[LINK:1] "Trailer Encapsulations," S. Leffler and M. Karels, RFC-893, + April 1984. + +[LINK:2] "An Ethernet Address Resolution Protocol," D. Plummer, RFC-826, + November 1982. + +[LINK:3] "A Standard for the Transmission of IP Datagrams over Ethernet + Networks," C. Hornig, RFC-894, April 1984. + +[LINK:4] "A Standard for the Transmission of IP Datagrams over IEEE 802 + Networks," J. Postel and J. Reynolds, RFC-1042, February 1988. + + This RFC contains a great deal of information of importance to + Internet implementers planning to use IEEE 802 networks.
+ + +IP LAYER REFERENCES + + +[IP:1] "Internet Protocol (IP)," J. Postel, RFC-791, September 1981. + +[IP:2] "Internet Control Message Protocol (ICMP)," J. Postel, RFC-792, + September 1981. + +[IP:3] "Internet Standard Subnetting Procedure," J. Mogul and J. Postel, + RFC-950, August 1985. + +[IP:4] "Host Extensions for IP Multicasting," S. Deering, RFC-1112, + August 1989. + +[IP:5] "Military Standard Internet Protocol," MIL-STD-1777, Department + of Defense, August 1983. + + This specification, as amended by RFC-963, is intended to describe + + + +Internet Engineering Task Force [Page 113] + + + + +RFC1122 TRANSPORT LAYER -- TCP October 1989 + + + the Internet Protocol but has some serious omissions (e.g., the + mandatory subnet extension [IP:3] and the optional multicasting + extension [IP:4]). It is also out of date. If there is a + conflict, RFC-791, RFC-792, and RFC-950 must be taken as + authoritative, while the present document is authoritative over + all. + +[IP:6] "Some Problems with the Specification of the Military Standard + Internet Protocol," D. Sidhu, RFC-963, November 1985. + +[IP:7] "The TCP Maximum Segment Size and Related Topics," J. Postel, + RFC-879, November 1983. + + Discusses and clarifies the relationship between the TCP Maximum + Segment Size option and the IP datagram size. + +[IP:8] "Internet Protocol Security Options," B. Schofield, RFC-1108, + October 1989. + +[IP:9] "Fragmentation Considered Harmful," C. Kent and J. Mogul, ACM + SIGCOMM-87, August 1987. Published as ACM Comp Comm Review, Vol. + 17, no. 5. + + This useful paper discusses the problems created by Internet + fragmentation and presents alternative solutions. + +[IP:10] "IP Datagram Reassembly Algorithms," D. Clark, RFC-815, July + 1982. + + This and the following paper should be read by every implementor. + +[IP:11] "Fault Isolation and Recovery," D. Clark, RFC-816, July 1982. 
+ +SECONDARY IP REFERENCES: + + +[IP:12] "Broadcasting Internet Datagrams in the Presence of Subnets," J. + Mogul, RFC-922, October 1984. + +[IP:13] "Name, Addresses, Ports, and Routes," D. Clark, RFC-814, July + 1982. + +[IP:14] "Something a Host Could Do with Source Quench: The Source Quench + Introduced Delay (SQUID)," W. Prue and J. Postel, RFC-1016, July + 1987. + + This RFC first described directed broadcast addresses. However, + the bulk of the RFC is concerned with gateways, not hosts. + + + +Internet Engineering Task Force [Page 114] + + + + +RFC1122 TRANSPORT LAYER -- TCP October 1989 + + +UDP REFERENCES: + + +[UDP:1] "User Datagram Protocol," J. Postel, RFC-768, August 1980. + + +TCP REFERENCES: + + +[TCP:1] "Transmission Control Protocol," J. Postel, RFC-793, September + 1981. + + +[TCP:2] "Transmission Control Protocol," MIL-STD-1778, US Department of + Defense, August 1984. + + This specification as amended by RFC-964 is intended to describe + the same protocol as RFC-793 [TCP:1]. If there is a conflict, + RFC-793 takes precedence, and the present document is authoritative + over both. + + +[TCP:3] "Some Problems with the Specification of the Military Standard + Transmission Control Protocol," D. Sidhu and T. Blumer, RFC-964, + November 1985. + + +[TCP:4] "The TCP Maximum Segment Size and Related Topics," J. Postel, + RFC-879, November 1983. + + +[TCP:5] "Window and Acknowledgment Strategy in TCP," D. Clark, RFC-813, + July 1982. + + +[TCP:6] "Round Trip Time Estimation," P. Karn & C. Partridge, ACM + SIGCOMM-87, August 1987. + + +[TCP:7] "Congestion Avoidance and Control," V. Jacobson, ACM SIGCOMM-88, + August 1988. + + +SECONDARY TCP REFERENCES: + + +[TCP:8] "Modularity and Efficiency in Protocol Implementation," D. + Clark, RFC-817, July 1982. + + + +Internet Engineering Task Force [Page 115] + + + + +RFC1122 TRANSPORT LAYER -- TCP October 1989 + + +[TCP:9] "Congestion Control in IP/TCP," J. Nagle, RFC-896, January 1984. 
+ + +[TCP:10] "Computing the Internet Checksum," R. Braden, D. Borman, and C. + Partridge, RFC-1071, September 1988. + + +[TCP:11] "TCP Extensions for Long-Delay Paths," V. Jacobson & R. Braden, + RFC-1072, October 1988. + + +Security Considerations + + There are many security issues in the communication layers of host + software, but a full discussion is beyond the scope of this RFC. + + The Internet architecture generally provides little protection + against spoofing of IP source addresses, so any security mechanism + that is based upon verifying the IP source address of a datagram + should be treated with suspicion. However, in restricted + environments some source-address checking may be possible. For + example, there might be a secure LAN whose gateway to the rest of the + Internet discarded any incoming datagram with a source address that + spoofed the LAN address. In this case, a host on the LAN could use + the source address to test for local vs. remote source. This problem + is complicated by source routing, and some have suggested that + source-routed datagram forwarding by hosts (see Section 3.3.5) should + be outlawed for security reasons. + + Security-related issues are mentioned in sections concerning the IP + Security option (Section 3.2.1.8), the ICMP Parameter Problem message + (Section 3.2.2.5), IP options in UDP datagrams (Section 4.1.3.2), and + reserved TCP ports (Section 4.2.2.1). + +Author's Address + + Robert Braden + USC/Information Sciences Institute + 4676 Admiralty Way + Marina del Rey, CA 90292-6695 + + Phone: (213) 822 1511 + + EMail: Braden@ISI.EDU + + + + + + + +Internet Engineering Task Force [Page 116] + diff --git a/ext/picotcp/RFC/rfc1123.txt b/ext/picotcp/RFC/rfc1123.txt new file mode 100644 index 0000000..51cdf83 --- /dev/null +++ b/ext/picotcp/RFC/rfc1123.txt @@ -0,0 +1,5782 @@ + + + + + + +Network Working Group Internet Engineering Task Force +Request for Comments: 1123 R. 
Braden, Editor + October 1989 + + + Requirements for Internet Hosts -- Application and Support + +Status of This Memo + + This RFC is an official specification for the Internet community. It + incorporates by reference, amends, corrects, and supplements the + primary protocol standards documents relating to hosts. Distribution + of this document is unlimited. + +Summary + + This RFC is one of a pair that defines and discusses the requirements + for Internet host software. This RFC covers the application and + support protocols; its companion RFC-1122 covers the communication + protocol layers: link layer, IP layer, and transport layer. + + + + Table of Contents + + + + + 1. INTRODUCTION ............................................... 5 + 1.1 The Internet Architecture .............................. 6 + 1.2 General Considerations ................................. 6 + 1.2.1 Continuing Internet Evolution ..................... 6 + 1.2.2 Robustness Principle .............................. 7 + 1.2.3 Error Logging ..................................... 8 + 1.2.4 Configuration ..................................... 8 + 1.3 Reading this Document .................................. 10 + 1.3.1 Organization ...................................... 10 + 1.3.2 Requirements ...................................... 10 + 1.3.3 Terminology ....................................... 11 + 1.4 Acknowledgments ........................................ 12 + + 2. GENERAL ISSUES ............................................. 13 + 2.1 Host Names and Numbers ................................. 13 + 2.2 Using Domain Name Service .............................. 13 + 2.3 Applications on Multihomed hosts ....................... 14 + 2.4 Type-of-Service ........................................ 14 + 2.5 GENERAL APPLICATION REQUIREMENTS SUMMARY ............... 15 + + + + +Internet Engineering Task Force [Page 1] + + + + +RFC1123 INTRODUCTION October 1989 + + + 3. 
REMOTE LOGIN -- TELNET PROTOCOL ............................ 16 + 3.1 INTRODUCTION ........................................... 16 + 3.2 PROTOCOL WALK-THROUGH .................................. 16 + 3.2.1 Option Negotiation ................................ 16 + 3.2.2 Telnet Go-Ahead Function .......................... 16 + 3.2.3 Control Functions ................................. 17 + 3.2.4 Telnet "Synch" Signal ............................. 18 + 3.2.5 NVT Printer and Keyboard .......................... 19 + 3.2.6 Telnet Command Structure .......................... 20 + 3.2.7 Telnet Binary Option .............................. 20 + 3.2.8 Telnet Terminal-Type Option ....................... 20 + 3.3 SPECIFIC ISSUES ........................................ 21 + 3.3.1 Telnet End-of-Line Convention ..................... 21 + 3.3.2 Data Entry Terminals .............................. 23 + 3.3.3 Option Requirements ............................... 24 + 3.3.4 Option Initiation ................................. 24 + 3.3.5 Telnet Linemode Option ............................ 25 + 3.4 TELNET/USER INTERFACE .................................. 25 + 3.4.1 Character Set Transparency ........................ 25 + 3.4.2 Telnet Commands ................................... 26 + 3.4.3 TCP Connection Errors ............................. 26 + 3.4.4 Non-Default Telnet Contact Port ................... 26 + 3.4.5 Flushing Output ................................... 26 + 3.5. TELNET REQUIREMENTS SUMMARY ........................... 27 + + 4. FILE TRANSFER .............................................. 29 + 4.1 FILE TRANSFER PROTOCOL -- FTP .......................... 29 + 4.1.1 INTRODUCTION ...................................... 29 + 4.1.2. PROTOCOL WALK-THROUGH ............................ 29 + 4.1.2.1 LOCAL Type ................................... 29 + 4.1.2.2 Telnet Format Control ........................ 30 + 4.1.2.3 Page Structure ............................... 
30 + 4.1.2.4 Data Structure Transformations ............... 30 + 4.1.2.5 Data Connection Management ................... 31 + 4.1.2.6 PASV Command ................................. 31 + 4.1.2.7 LIST and NLST Commands ....................... 31 + 4.1.2.8 SITE Command ................................. 32 + 4.1.2.9 STOU Command ................................. 32 + 4.1.2.10 Telnet End-of-line Code ..................... 32 + 4.1.2.11 FTP Replies ................................. 33 + 4.1.2.12 Connections ................................. 34 + 4.1.2.13 Minimum Implementation; RFC-959 Section ..... 34 + 4.1.3 SPECIFIC ISSUES ................................... 35 + 4.1.3.1 Non-standard Command Verbs ................... 35 + 4.1.3.2 Idle Timeout ................................. 36 + 4.1.3.3 Concurrency of Data and Control .............. 36 + 4.1.3.4 FTP Restart Mechanism ........................ 36 + 4.1.4 FTP/USER INTERFACE ................................ 39 + + + +Internet Engineering Task Force [Page 2] + + + + +RFC1123 INTRODUCTION October 1989 + + + 4.1.4.1 Pathname Specification ....................... 39 + 4.1.4.2 "QUOTE" Command .............................. 40 + 4.1.4.3 Displaying Replies to User ................... 40 + 4.1.4.4 Maintaining Synchronization .................. 40 + 4.1.5 FTP REQUIREMENTS SUMMARY ......................... 41 + 4.2 TRIVIAL FILE TRANSFER PROTOCOL -- TFTP ................. 44 + 4.2.1 INTRODUCTION ...................................... 44 + 4.2.2 PROTOCOL WALK-THROUGH ............................. 44 + 4.2.2.1 Transfer Modes ............................... 44 + 4.2.2.2 UDP Header ................................... 44 + 4.2.3 SPECIFIC ISSUES ................................... 44 + 4.2.3.1 Sorcerer's Apprentice Syndrome ............... 44 + 4.2.3.2 Timeout Algorithms ........................... 46 + 4.2.3.3 Extensions ................................... 46 + 4.2.3.4 Access Control ............................... 
46 + 4.2.3.5 Broadcast Request ............................ 46 + 4.2.4 TFTP REQUIREMENTS SUMMARY ......................... 47 + + 5. ELECTRONIC MAIL -- SMTP and RFC-822 ........................ 48 + 5.1 INTRODUCTION ........................................... 48 + 5.2 PROTOCOL WALK-THROUGH .................................. 48 + 5.2.1 The SMTP Model .................................... 48 + 5.2.2 Canonicalization .................................. 49 + 5.2.3 VRFY and EXPN Commands ............................ 50 + 5.2.4 SEND, SOML, and SAML Commands ..................... 50 + 5.2.5 HELO Command ...................................... 50 + 5.2.6 Mail Relay ........................................ 51 + 5.2.7 RCPT Command ...................................... 52 + 5.2.8 DATA Command ...................................... 53 + 5.2.9 Command Syntax .................................... 54 + 5.2.10 SMTP Replies ..................................... 54 + 5.2.11 Transparency ..................................... 55 + 5.2.12 WKS Use in MX Processing ......................... 55 + 5.2.13 RFC-822 Message Specification .................... 55 + 5.2.14 RFC-822 Date and Time Specification .............. 55 + 5.2.15 RFC-822 Syntax Change ............................ 56 + 5.2.16 RFC-822 Local-part .............................. 56 + 5.2.17 Domain Literals .................................. 57 + 5.2.18 Common Address Formatting Errors ................. 58 + 5.2.19 Explicit Source Routes ........................... 58 + 5.3 SPECIFIC ISSUES ........................................ 59 + 5.3.1 SMTP Queueing Strategies .......................... 59 + 5.3.1.1 Sending Strategy .............................. 59 + 5.3.1.2 Receiving strategy ........................... 61 + 5.3.2 Timeouts in SMTP .................................. 61 + 5.3.3 Reliable Mail Receipt ............................. 63 + 5.3.4 Reliable Mail Transmission ........................ 
63 + 5.3.5 Domain Name Support ............................... 65 + + + +Internet Engineering Task Force [Page 3] + + + + +RFC1123 INTRODUCTION October 1989 + + + 5.3.6 Mailing Lists and Aliases ......................... 65 + 5.3.7 Mail Gatewaying ................................... 66 + 5.3.8 Maximum Message Size .............................. 68 + 5.4 SMTP REQUIREMENTS SUMMARY .............................. 69 + + 6. SUPPORT SERVICES ............................................ 72 + 6.1 DOMAIN NAME TRANSLATION ................................. 72 + 6.1.1 INTRODUCTION ....................................... 72 + 6.1.2 PROTOCOL WALK-THROUGH ............................. 72 + 6.1.2.1 Resource Records with Zero TTL ............... 73 + 6.1.2.2 QCLASS Values ................................ 73 + 6.1.2.3 Unused Fields ................................ 73 + 6.1.2.4 Compression .................................. 73 + 6.1.2.5 Misusing Configuration Info .................. 73 + 6.1.3 SPECIFIC ISSUES ................................... 74 + 6.1.3.1 Resolver Implementation ...................... 74 + 6.1.3.2 Transport Protocols .......................... 75 + 6.1.3.3 Efficient Resource Usage ..................... 77 + 6.1.3.4 Multihomed Hosts ............................. 78 + 6.1.3.5 Extensibility ................................ 79 + 6.1.3.6 Status of RR Types ........................... 79 + 6.1.3.7 Robustness ................................... 80 + 6.1.3.8 Local Host Table ............................. 80 + 6.1.4 DNS USER INTERFACE ................................ 81 + 6.1.4.1 DNS Administration ........................... 81 + 6.1.4.2 DNS User Interface ........................... 81 + 6.1.4.3 Interface Abbreviation Facilities ............. 82 + 6.1.5 DOMAIN NAME SYSTEM REQUIREMENTS SUMMARY ........... 84 + 6.2 HOST INITIALIZATION .................................... 87 + 6.2.1 INTRODUCTION ...................................... 
87 + 6.2.2 REQUIREMENTS ...................................... 87 + 6.2.2.1 Dynamic Configuration ........................ 87 + 6.2.2.2 Loading Phase ................................ 89 + 6.3 REMOTE MANAGEMENT ...................................... 90 + 6.3.1 INTRODUCTION ...................................... 90 + 6.3.2 PROTOCOL WALK-THROUGH ............................. 90 + 6.3.3 MANAGEMENT REQUIREMENTS SUMMARY ................... 92 + + 7. REFERENCES ................................................. 93 + + + + + + + + + + + + +Internet Engineering Task Force [Page 4] + + + + +RFC1123 INTRODUCTION October 1989 + + +1. INTRODUCTION + + This document is one of a pair that defines and discusses the + requirements for host system implementations of the Internet protocol + suite. This RFC covers the applications layer and support protocols. + Its companion RFC, "Requirements for Internet Hosts -- Communications + Layers" [INTRO:1] covers the lower layer protocols: transport layer, + IP layer, and link layer. + + These documents are intended to provide guidance for vendors, + implementors, and users of Internet communication software. They + represent the consensus of a large body of technical experience and + wisdom, contributed by members of the Internet research and vendor + communities. + + This RFC enumerates standard protocols that a host connected to the + Internet must use, and it incorporates by reference the RFCs and + other documents describing the current specifications for these + protocols. It corrects errors in the referenced documents and adds + additional discussion and guidance for an implementor. + + For each protocol, this document also contains an explicit set of + requirements, recommendations, and options. 
The reader must + understand that the list of requirements in this document is + incomplete by itself; the complete set of requirements for an + Internet host is primarily defined in the standard protocol + specification documents, with the corrections, amendments, and + supplements contained in this RFC. + + A good-faith implementation of the protocols that was produced after + careful reading of the RFC's and with some interaction with the + Internet technical community, and that followed good communications + software engineering practices, should differ from the requirements + of this document in only minor ways. Thus, in many cases, the + "requirements" in this RFC are already stated or implied in the + standard protocol documents, so that their inclusion here is, in a + sense, redundant. However, they were included because some past + implementation has made the wrong choice, causing problems of + interoperability, performance, and/or robustness. + + This document includes discussion and explanation of many of the + requirements and recommendations. A simple list of requirements + would be dangerous, because: + + o Some required features are more important than others, and some + features are optional. + + o There may be valid reasons why particular vendor products that + + + +Internet Engineering Task Force [Page 5] + + + + +RFC1123 INTRODUCTION October 1989 + + + are designed for restricted contexts might choose to use + different specifications. + + However, the specifications of this document must be followed to meet + the general goal of arbitrary host interoperation across the + diversity and complexity of the Internet system. Although most + current implementations fail to meet these requirements in various + ways, some minor and some major, this specification is the ideal + towards which we need to move. + + These requirements are based on the current level of Internet + architecture. 
This document will be updated as required to provide + additional clarifications or to include additional information in + those areas in which specifications are still evolving. + + This introductory section begins with general advice to host software + vendors, and then gives some guidance on reading the rest of the + document. Section 2 contains general requirements that may be + applicable to all application and support protocols. Sections 3, 4, + and 5 contain the requirements on protocols for the three major + applications: Telnet, file transfer, and electronic mail, + respectively. Section 6 covers the support applications: the domain + name system, system initialization, and management. Finally, all + references will be found in Section 7. + + 1.1 The Internet Architecture + + For a brief introduction to the Internet architecture from a host + viewpoint, see Section 1.1 of [INTRO:1]. That section also + contains recommended references for general background on the + Internet architecture. + + 1.2 General Considerations + + There are two important lessons that vendors of Internet host + software have learned and which a new vendor should consider + seriously. + + 1.2.1 Continuing Internet Evolution + + The enormous growth of the Internet has revealed problems of + management and scaling in a large datagram-based packet + communication system. These problems are being addressed, and + as a result there will be continuing evolution of the + specifications described in this document. These changes will + be carefully planned and controlled, since there is extensive + participation in this planning by the vendors and by the + organizations responsible for operations of the networks. + + + +Internet Engineering Task Force [Page 6] + + + + +RFC1123 INTRODUCTION October 1989 + + + Development, evolution, and revision are characteristic of + computer network protocols today, and this situation will + persist for some years. 
A vendor who develops computer + communication software for the Internet protocol suite (or any + other protocol suite!) and then fails to maintain and update + that software for changing specifications is going to leave a + trail of unhappy customers. The Internet is a large + communication network, and the users are in constant contact + through it. Experience has shown that knowledge of + deficiencies in vendor software propagates quickly through the + Internet technical community. + + 1.2.2 Robustness Principle + + At every layer of the protocols, there is a general rule whose + application can lead to enormous benefits in robustness and + interoperability: + + "Be liberal in what you accept, and + conservative in what you send" + + Software should be written to deal with every conceivable + error, no matter how unlikely; sooner or later a packet will + come in with that particular combination of errors and + attributes, and unless the software is prepared, chaos can + ensue. In general, it is best to assume that the network is + filled with malevolent entities that will send in packets + designed to have the worst possible effect. This assumption + will lead to suitable protective design, although the most + serious problems in the Internet have been caused by + unenvisaged mechanisms triggered by low-probability events; + mere human malice would never have taken so devious a course! + + Adaptability to change must be designed into all levels of + Internet host software. As a simple example, consider a + protocol specification that contains an enumeration of values + for a particular header field -- e.g., a type field, a port + number, or an error code; this enumeration must be assumed to + be incomplete. Thus, if a protocol specification defines four + possible error codes, the software must not break when a fifth + code shows up. An undefined code might be logged (see below), + but it must not cause a failure. 
+ + The second part of the principle is almost as important: + software on other hosts may contain deficiencies that make it + unwise to exploit legal but obscure protocol features. It is + unwise to stray far from the obvious and simple, lest untoward + effects result elsewhere. A corollary of this is "watch out + + + +Internet Engineering Task Force [Page 7] + + + + +RFC1123 INTRODUCTION October 1989 + + + for misbehaving hosts"; host software should be prepared, not + just to survive other misbehaving hosts, but also to cooperate + to limit the amount of disruption such hosts can cause to the + shared communication facility. + + 1.2.3 Error Logging + + The Internet includes a great variety of host and gateway + systems, each implementing many protocols and protocol layers, + and some of these contain bugs and mis-features in their + Internet protocol software. As a result of complexity, + diversity, and distribution of function, the diagnosis of user + problems is often very difficult. + + Problem diagnosis will be aided if host implementations include + a carefully designed facility for logging erroneous or + "strange" protocol events. It is important to include as much + diagnostic information as possible when an error is logged. In + particular, it is often useful to record the header(s) of a + packet that caused an error. However, care must be taken to + ensure that error logging does not consume prohibitive amounts + of resources or otherwise interfere with the operation of the + host. + + There is a tendency for abnormal but harmless protocol events + to overflow error logging files; this can be avoided by using a + "circular" log, or by enabling logging only while diagnosing a + known failure. It may be useful to filter and count duplicate + successive messages. 
One strategy that seems to work well is: + (1) always count abnormalities and make such counts accessible + through the management protocol (see Section 6.3); and (2) + allow the logging of a great variety of events to be + selectively enabled. For example, it might be useful to be able + to "log everything" or to "log everything for host X". + + Note that different managements may have differing policies + about the amount of error logging that they want normally + enabled in a host. Some will say, "if it doesn't hurt me, I + don't want to know about it", while others will want to take a + more watchful and aggressive attitude about detecting and + removing protocol abnormalities. + + 1.2.4 Configuration + + It would be ideal if a host implementation of the Internet + protocol suite could be entirely self-configuring. This would + allow the whole suite to be implemented in ROM or cast into + silicon, it would simplify diskless workstations, and it would + + + +Internet Engineering Task Force [Page 8] + + + + +RFC1123 INTRODUCTION October 1989 + + + be an immense boon to harried LAN administrators as well as + system vendors. We have not reached this ideal; in fact, we + are not even close. + + At many points in this document, you will find a requirement + that a parameter be a configurable option. There are several + different reasons behind such requirements. In a few cases, + there is current uncertainty or disagreement about the best + value, and it may be necessary to update the recommended value + in the future. In other cases, the value really depends on + external factors -- e.g., the size of the host and the + distribution of its communication load, or the speeds and + topology of nearby networks -- and self-tuning algorithms are + unavailable and may be insufficient. In some cases, + configurability is needed because of administrative + requirements.
+ + Finally, some configuration options are required to communicate + with obsolete or incorrect implementations of the protocols, + distributed without sources, that unfortunately persist in many + parts of the Internet. To make correct systems coexist with + these faulty systems, administrators often have to "mis- + configure" the correct systems. This problem will correct + itself gradually as the faulty systems are retired, but it + cannot be ignored by vendors. + + When we say that a parameter must be configurable, we do not + intend to require that its value be explicitly read from a + configuration file at every boot time. We recommend that + implementors set up a default for each parameter, so a + configuration file is only necessary to override those defaults + that are inappropriate in a particular installation. Thus, the + configurability requirement is an assurance that it will be + POSSIBLE to override the default when necessary, even in a + binary-only or ROM-based product. + + This document requires a particular value for such defaults in + some cases. The choice of default is a sensitive issue when + the configuration item controls the accommodation to existing + faulty systems. If the Internet is to converge successfully to + complete interoperability, the default values built into + implementations must implement the official protocol, not + "mis-configurations" to accommodate faulty implementations. + Although marketing considerations have led some vendors to + choose mis-configuration defaults, we urge vendors to choose + defaults that will conform to the standard. + + Finally, we note that a vendor needs to provide adequate + + + +Internet Engineering Task Force [Page 9] + + + + +RFC1123 INTRODUCTION October 1989 + + + documentation on all configuration parameters, their limits and + effects. 
+ + + 1.3 Reading this Document + + 1.3.1 Organization + + In general, each major section is organized into the following + subsections: + + (1) Introduction + + (2) Protocol Walk-Through -- considers the protocol + specification documents section-by-section, correcting + errors, stating requirements that may be ambiguous or + ill-defined, and providing further clarification or + explanation. + + (3) Specific Issues -- discusses protocol design and + implementation issues that were not included in the walk- + through. + + (4) Interfaces -- discusses the service interface to the next + higher layer. + + (5) Summary -- contains a summary of the requirements of the + section. + + Under many of the individual topics in this document, there is + parenthetical material labeled "DISCUSSION" or + "IMPLEMENTATION". This material is intended to give + clarification and explanation of the preceding requirements + text. It also includes some suggestions on possible future + directions or developments. The implementation material + contains suggested approaches that an implementor may want to + consider. + + The summary sections are intended to be guides and indexes to + the text, but are necessarily cryptic and incomplete. The + summaries should never be used or referenced separately from + the complete RFC. + + 1.3.2 Requirements + + In this document, the words that are used to define the + significance of each particular requirement are capitalized. + These words are: + + + +Internet Engineering Task Force [Page 10] + + + + +RFC1123 INTRODUCTION October 1989 + + + * "MUST" + + This word or the adjective "REQUIRED" means that the item + is an absolute requirement of the specification. + + * "SHOULD" + + This word or the adjective "RECOMMENDED" means that there + may exist valid reasons in particular circumstances to + ignore this item, but the full implications should be + understood and the case carefully weighed before choosing + a different course. 
+ + * "MAY" + + This word or the adjective "OPTIONAL" means that this item + is truly optional. One vendor may choose to include the + item because a particular marketplace requires it or + because it enhances the product, for example; another + vendor may omit the same item. + + + An implementation is not compliant if it fails to satisfy one + or more of the MUST requirements for the protocols it + implements. An implementation that satisfies all the MUST and + all the SHOULD requirements for its protocols is said to be + "unconditionally compliant"; one that satisfies all the MUST + requirements but not all the SHOULD requirements for its + protocols is said to be "conditionally compliant". + + 1.3.3 Terminology + + This document uses the following technical terms: + + Segment + A segment is the unit of end-to-end transmission in the + TCP protocol. A segment consists of a TCP header followed + by application data. A segment is transmitted by + encapsulation in an IP datagram. + + Message + This term is used by some application layer protocols + (particularly SMTP) for an application data unit. + + Datagram + A [UDP] datagram is the unit of end-to-end transmission in + the UDP protocol. + + + + +Internet Engineering Task Force [Page 11] + + + + +RFC1123 INTRODUCTION October 1989 + + + Multihomed + A host is said to be multihomed if it has multiple IP + addresses to connected networks. + + + + 1.4 Acknowledgments + + This document incorporates contributions and comments from a large + group of Internet protocol experts, including representatives of + university and research labs, vendors, and government agencies. + It was assembled primarily by the Host Requirements Working Group + of the Internet Engineering Task Force (IETF). 
+ + The Editor would especially like to acknowledge the tireless + dedication of the following people, who attended many long + meetings and generated 3 million bytes of electronic mail over the + past 18 months in pursuit of this document: Philip Almquist, Dave + Borman (Cray Research), Noel Chiappa, Dave Crocker (DEC), Steve + Deering (Stanford), Mike Karels (Berkeley), Phil Karn (Bellcore), + John Lekashman (NASA), Charles Lynn (BBN), Keith McCloghrie (TWG), + Paul Mockapetris (ISI), Thomas Narten (Purdue), Craig Partridge + (BBN), Drew Perkins (CMU), and James Van Bokkelen (FTP Software). + + In addition, the following people made major contributions to the + effort: Bill Barns (Mitre), Steve Bellovin (AT&T), Mike Brescia + (BBN), Ed Cain (DCA), Annette DeSchon (ISI), Martin Gross (DCA), + Phill Gross (NRI), Charles Hedrick (Rutgers), Van Jacobson (LBL), + John Klensin (MIT), Mark Lottor (SRI), Milo Medin (NASA), Bill + Melohn (Sun Microsystems), Greg Minshall (Kinetics), Jeff Mogul + (DEC), John Mullen (CMC), Jon Postel (ISI), John Romkey (Epilogue + Technology), and Mike StJohns (DCA). The following also made + significant contributions to particular areas: Eric Allman + (Berkeley), Rob Austein (MIT), Art Berggreen (ACC), Keith Bostic + (Berkeley), Vint Cerf (NRI), Wayne Hathaway (NASA), Matt Korn + (IBM), Erik Naggum (Naggum Software, Norway), Robert Ullmann + (Prime Computer), David Waitzman (BBN), Frank Wancho (USA), Arun + Welch (Ohio State), Bill Westfield (Cisco), and Rayan Zachariassen + (Toronto). + + We are grateful to all, including any contributors who may have + been inadvertently omitted from this list. + + + + + + + + + +Internet Engineering Task Force [Page 12] + + + + +RFC1123 APPLICATIONS LAYER -- GENERAL October 1989 + + +2. GENERAL ISSUES + + This section contains general requirements that may be applicable to + all application-layer protocols. 
+ + 2.1 Host Names and Numbers + + The syntax of a legal Internet host name was specified in RFC-952 + [DNS:4]. One aspect of host name syntax is hereby changed: the + restriction on the first character is relaxed to allow either a + letter or a digit. Host software MUST support this more liberal + syntax. + + Host software MUST handle host names of up to 63 characters and + SHOULD handle host names of up to 255 characters. + + Whenever a user inputs the identity of an Internet host, it SHOULD + be possible to enter either (1) a host domain name or (2) an IP + address in dotted-decimal ("#.#.#.#") form. The host SHOULD check + the string syntactically for a dotted-decimal number before + looking it up in the Domain Name System. + + DISCUSSION: + This last requirement is not intended to specify the complete + syntactic form for entering a dotted-decimal host number; + that is considered to be a user-interface issue. For + example, a dotted-decimal number must be enclosed within + "[ ]" brackets for SMTP mail (see Section 5.2.17). This + notation could be made universal within a host system, + simplifying the syntactic checking for a dotted-decimal + number. + + If a dotted-decimal number can be entered without such + identifying delimiters, then a full syntactic check must be + made, because a segment of a host domain name is now allowed + to begin with a digit and could legally be entirely numeric + (see Section 6.1.2.4). However, a valid host name can never + have the dotted-decimal form #.#.#.#, since at least the + highest-level component label will be alphabetic. + + 2.2 Using Domain Name Service + + Host domain names MUST be translated to IP addresses as described + in Section 6.1. + + Applications using domain name services MUST be able to cope with + soft error conditions. 
Applications MUST wait a reasonable + interval between successive retries due to a soft error, and MUST + + + +Internet Engineering Task Force [Page 13] + + + + +RFC1123 APPLICATIONS LAYER -- GENERAL October 1989 + + + allow for the possibility that network problems may deny service + for hours or even days. + + An application SHOULD NOT rely on the ability to locate a WKS + record containing an accurate listing of all services at a + particular host address, since the WKS RR type is not often used + by Internet sites. To confirm that a service is present, simply + attempt to use it. + + 2.3 Applications on Multihomed hosts + + When the remote host is multihomed, the name-to-address + translation will return a list of alternative IP addresses. As + specified in Section 6.1.3.4, this list should be in order of + decreasing preference. Application protocol implementations + SHOULD be prepared to try multiple addresses from the list until + success is obtained. More specific requirements for SMTP are + given in Section 5.3.4. + + When the local host is multihomed, a UDP-based request/response + application SHOULD send the response with an IP source address + that is the same as the specific destination address of the UDP + request datagram. The "specific destination address" is defined + in the "IP Addressing" section of the companion RFC [INTRO:1]. + + Similarly, a server application that opens multiple TCP + connections to the same client SHOULD use the same local IP + address for all. + + 2.4 Type-of-Service + + Applications MUST select appropriate TOS values when they invoke + transport layer services, and these values MUST be configurable. + Note that a TOS value contains 5 bits, of which only the most- + significant 3 bits are currently defined; the other two bits MUST + be zero. + + DISCUSSION: + As gateway algorithms are developed to implement Type-of- + Service, the recommended values for various application + protocols may change. 
In addition, it is likely that + particular combinations of users and Internet paths will want + non-standard TOS values. For these reasons, the TOS values + must be configurable. + + See the latest version of the "Assigned Numbers" RFC + [INTRO:5] for the recommended TOS values for the major + application protocols. + + + +Internet Engineering Task Force [Page 14] + + + + +RFC1123 APPLICATIONS LAYER -- GENERAL October 1989 + + + 2.5 GENERAL APPLICATION REQUIREMENTS SUMMARY + + | | | | |S| | + | | | | |H| |F + | | | | |O|M|o + | | |S| |U|U|o + | | |H| |L|S|t + | |M|O| |D|T|n + | |U|U|M| | |o + | |S|L|A|N|N|t + | |T|D|Y|O|O|t +FEATURE |SECTION | | | |T|T|e +-----------------------------------------------|----------|-|-|-|-|-|-- + | | | | | | | +User interfaces: | | | | | | | + Allow host name to begin with digit |2.1 |x| | | | | + Host names of up to 63 characters |2.1 |x| | | | | + Host names of up to 255 characters |2.1 | |x| | | | + Support dotted-decimal host numbers |2.1 | |x| | | | + Check syntactically for dotted-dec first |2.1 | |x| | | | + | | | | | | | +Map domain names per Section 6.1 |2.2 |x| | | | | +Cope with soft DNS errors |2.2 |x| | | | | + Reasonable interval between retries |2.2 |x| | | | | + Allow for long outages |2.2 |x| | | | | +Expect WKS records to be available |2.2 | | | |x| | + | | | | | | | +Try multiple addr's for remote multihomed host |2.3 | |x| | | | +UDP reply src addr is specific dest of request |2.3 | |x| | | | +Use same IP addr for related TCP connections |2.3 | |x| | | | +Specify appropriate TOS values |2.4 |x| | | | | + TOS values configurable |2.4 |x| | | | | + Unused TOS bits zero |2.4 |x| | | | | + | | | | | | | + | | | | | | | + + + + + + + + + + + + + + + + +Internet Engineering Task Force [Page 15] + + + + +RFC1123 REMOTE LOGIN -- TELNET October 1989 + + +3. REMOTE LOGIN -- TELNET PROTOCOL + + 3.1 INTRODUCTION + + Telnet is the standard Internet application protocol for remote + login.
It provides the encoding rules to link a user's + keyboard/display on a client ("user") system with a command + interpreter on a remote server system. A subset of the Telnet + protocol is also incorporated within other application protocols, + e.g., FTP and SMTP. + + Telnet uses a single TCP connection, and its normal data stream + ("Network Virtual Terminal" or "NVT" mode) is 7-bit ASCII with + escape sequences to embed control functions. Telnet also allows + the negotiation of many optional modes and functions. + + The primary Telnet specification is to be found in RFC-854 + [TELNET:1], while the options are defined in many other RFCs; see + Section 7 for references. + + 3.2 PROTOCOL WALK-THROUGH + + 3.2.1 Option Negotiation: RFC-854, pp. 2-3 + + Every Telnet implementation MUST include option negotiation and + subnegotiation machinery [TELNET:2]. + + A host MUST carefully follow the rules of RFC-854 to avoid + option-negotiation loops. A host MUST refuse (i.e., reply + WONT/DONT to a DO/WILL) an unsupported option. Option + negotiation SHOULD continue to function (even if all requests + are refused) throughout the lifetime of a Telnet connection. + + If all option negotiations fail, a Telnet implementation MUST + default to, and support, an NVT. + + DISCUSSION: + Even though more sophisticated "terminals" and supporting + option negotiations are becoming the norm, all + implementations must be prepared to support an NVT for any + user-server communication. + + 3.2.2 Telnet Go-Ahead Function: RFC-854, p. 5, and RFC-858 + + On a host that never sends the Telnet command Go Ahead (GA), + the Telnet Server MUST attempt to negotiate the Suppress Go + Ahead option (i.e., send "WILL Suppress Go Ahead"). A User or + Server Telnet MUST always accept negotiation of the Suppress Go + + + +Internet Engineering Task Force [Page 16] + + + + +RFC1123 REMOTE LOGIN -- TELNET October 1989 + + + Ahead option.
+ + When it is driving a full-duplex terminal for which GA has no + meaning, a User Telnet implementation MAY ignore GA commands. + + DISCUSSION: + Half-duplex ("locked-keyboard") line-at-a-time terminals + for which the Go-Ahead mechanism was designed have largely + disappeared from the scene. It turned out to be difficult + to implement sending the Go-Ahead signal in many operating + systems, even some systems that support native half-duplex + terminals. The difficulty is typically that the Telnet + server code does not have access to information about + whether the user process is blocked awaiting input from + the Telnet connection, i.e., it cannot reliably determine + when to send a GA command. Therefore, most Telnet Server + hosts do not send GA commands. + + The effect of the rules in this section is to allow either + end of a Telnet connection to veto the use of GA commands. + + There is a class of half-duplex terminals that is still + commercially important: "data entry terminals," which + interact in a full-screen manner. However, supporting + data entry terminals using the Telnet protocol does not + require the Go Ahead signal; see Section 3.3.2. + + 3.2.3 Control Functions: RFC-854, pp. 7-8 + + The list of Telnet commands has been extended to include EOR + (End-of-Record), with code 239 [TELNET:9]. + + Both User and Server Telnets MAY support the control functions + EOR, EC, EL, and Break, and MUST support AO, AYT, DM, IP, NOP, + SB, and SE. + + A host MUST be able to receive and ignore any Telnet control + functions that it does not support. + + DISCUSSION: + Note that a Server Telnet is required to support the + Telnet IP (Interrupt Process) function, even if the server + host has an equivalent in-stream function (e.g., Control-C + in many systems). The Telnet IP function may be stronger + than an in-stream interrupt command, because of the out- + of-band effect of TCP urgent data. 
+ + The EOR control function may be used to delimit the + + + +Internet Engineering Task Force [Page 17] + + + + +RFC1123 REMOTE LOGIN -- TELNET October 1989 + + + stream. An important application is data entry terminal + support (see Section 3.3.2). There was concern that since + EOR had not been defined in RFC-854, a host that was not + prepared to correctly ignore unknown Telnet commands might + crash if it received an EOR. To protect such hosts, the + End-of-Record option [TELNET:9] was introduced; however, a + properly implemented Telnet program will not require this + protection. + + 3.2.4 Telnet "Synch" Signal: RFC-854, pp. 8-10 + + When it receives "urgent" TCP data, a User or Server Telnet + MUST discard all data except Telnet commands until the DM (and + end of urgent) is reached. + + When it sends Telnet IP (Interrupt Process), a User Telnet + SHOULD follow it by the Telnet "Synch" sequence, i.e., send as + TCP urgent data the sequence "IAC IP IAC DM". The TCP urgent + pointer points to the DM octet. + + When it receives a Telnet IP command, a Server Telnet MAY send + a Telnet "Synch" sequence back to the user, to flush the output + stream. The choice ought to be consistent with the way the + server operating system behaves when a local user interrupts a + process. + + When it receives a Telnet AO command, a Server Telnet MUST send + a Telnet "Synch" sequence back to the user, to flush the output + stream. + + A User Telnet SHOULD have the capability of flushing output + when it sends a Telnet IP; see also Section 3.4.5. + + DISCUSSION: + There are three possible ways for a User Telnet to flush + the stream of server output data: + + (1) Send AO after IP. + + This will cause the server host to send a "flush- + buffered-output" signal to its operating system. + However, the AO may not take effect locally, i.e., + stop terminal output at the User Telnet end, until + the Server Telnet has received and processed the AO + and has sent back a "Synch". 
+ + (2) Send DO TIMING-MARK [TELNET:7] after IP, and discard + all output locally until a WILL/WONT TIMING-MARK is + + + +Internet Engineering Task Force [Page 18] + + + + +RFC1123 REMOTE LOGIN -- TELNET October 1989 + + + received from the Server Telnet. + + Since the DO TIMING-MARK will be processed after the + IP at the server, the reply to it should be in the + right place in the output data stream. However, the + TIMING-MARK will not send a "flush buffered output" + signal to the server operating system. Whether or + not this is needed is dependent upon the server + system. + + (3) Do both. + + The best method is not entirely clear, since it must + accommodate a number of existing server hosts that do not + follow the Telnet standards in various ways. The safest + approach is probably to provide a user-controllable option + to select (1), (2), or (3). + + 3.2.5 NVT Printer and Keyboard: RFC-854, p. 11 + + In NVT mode, a Telnet SHOULD NOT send characters with the + high-order bit 1, and MUST NOT send it as a parity bit. + Implementations that pass the high-order bit to applications + SHOULD negotiate binary mode (see Section 3.2.6). + + + DISCUSSION: + Implementors should be aware that a strict reading of + RFC-854 allows a client or server expecting NVT ASCII to + ignore characters with the high-order bit set. In + general, binary mode is expected to be used for + transmission of an extended (beyond 7-bit) character set + with Telnet. + + However, there exist applications that really need an 8- + bit NVT mode, which is currently not defined, and these + existing applications do set the high-order bit during + part or all of the life of a Telnet connection. Note that + binary mode is not the same as 8-bit NVT mode, since + binary mode turns off end-of-line processing. For this + reason, the requirements on the high-order bit are stated + as SHOULD, not MUST. 
+ + RFC-854 defines a minimal set of properties of a "network + virtual terminal" or NVT; this is not meant to preclude + additional features in a real terminal. A Telnet + connection is fully transparent to all 7-bit ASCII + characters, including arbitrary ASCII control characters. + + + +Internet Engineering Task Force [Page 19] + + + + +RFC1123 REMOTE LOGIN -- TELNET October 1989 + + + For example, a terminal might support full-screen commands + coded as ASCII escape sequences; a Telnet implementation + would pass these sequences as uninterpreted data. Thus, + an NVT should not be conceived as a terminal type of a + highly-restricted device. + + 3.2.6 Telnet Command Structure: RFC-854, p. 13 + + Since options may appear at any point in the data stream, a + Telnet escape character (known as IAC, with the value 255) to + be sent as data MUST be doubled. + + 3.2.7 Telnet Binary Option: RFC-856 + + When the Binary option has been successfully negotiated, + arbitrary 8-bit characters are allowed. However, the data + stream MUST still be scanned for IAC characters, any embedded + Telnet commands MUST be obeyed, and data bytes equal to IAC + MUST be doubled. Other character processing (e.g., replacing + CR by CR NUL or by CR LF) MUST NOT be done. In particular, + there is no end-of-line convention (see Section 3.3.1) in + binary mode. + + DISCUSSION: + The Binary option is normally negotiated in both + directions, to change the Telnet connection from NVT mode + to "binary mode". + + The sequence IAC EOR can be used to delimit blocks of data + within a binary-mode Telnet stream. + + 3.2.8 Telnet Terminal-Type Option: RFC-1091 + + The Terminal-Type option MUST use the terminal type names + officially defined in the Assigned Numbers RFC [INTRO:5], when + they are available for the particular terminal. However, the + receiver of a Terminal-Type option MUST accept any name. 
+ + DISCUSSION: + RFC-1091 [TELNET:10] updates an earlier version of the + Terminal-Type option defined in RFC-930. The earlier + version allowed a server host capable of supporting + multiple terminal types to learn the type of a particular + client's terminal, assuming that each physical terminal + had an intrinsic type. However, today a "terminal" is + often really a terminal emulator program running in a PC, + perhaps capable of emulating a range of terminal types. + Therefore, RFC-1091 extends the specification to allow a + + + +Internet Engineering Task Force [Page 20] + + + + +RFC1123 REMOTE LOGIN -- TELNET October 1989 + + + more general terminal-type negotiation between User and + Server Telnets. + + 3.3 SPECIFIC ISSUES + + 3.3.1 Telnet End-of-Line Convention + + The Telnet protocol defines the sequence CR LF to mean "end- + of-line". For terminal input, this corresponds to a command- + completion or "end-of-line" key being pressed on a user + terminal; on an ASCII terminal, this is the CR key, but it may + also be labelled "Return" or "Enter". + + When a Server Telnet receives the Telnet end-of-line sequence + CR LF as input from a remote terminal, the effect MUST be the + same as if the user had pressed the "end-of-line" key on a + local terminal. On server hosts that use ASCII, in particular, + receipt of the Telnet sequence CR LF must cause the same effect + as a local user pressing the CR key on a local terminal. Thus, + CR LF and CR NUL MUST have the same effect on an ASCII server + host when received as input over a Telnet connection. + + A User Telnet MUST be able to send any of the forms: CR LF, CR + NUL, and LF. A User Telnet on an ASCII host SHOULD have a + user-controllable mode to send either CR LF or CR NUL when the + user presses the "end-of-line" key, and CR LF SHOULD be the + default. 
+ + The Telnet end-of-line sequence CR LF MUST be used to send + Telnet data that is not terminal-to-computer (e.g., for Server + Telnet sending output, or the Telnet protocol incorporated in + another application protocol). + + DISCUSSION: + To allow interoperability between arbitrary Telnet clients + and servers, the Telnet protocol defined a standard + representation for a line terminator. Since the ASCII + character set includes no explicit end-of-line character, + systems have chosen various representations, e.g., CR, LF, + and the sequence CR LF. The Telnet protocol chose the CR + LF sequence as the standard for network transmission. + + Unfortunately, the Telnet protocol specification in RFC- + 854 [TELNET:1] has turned out to be somewhat ambiguous on + what character(s) should be sent from client to server for + the "end-of-line" key. The result has been a massive and + continuing interoperability headache, made worse by + various faulty implementations of both User and Server + + + +Internet Engineering Task Force [Page 21] + + + + +RFC1123 REMOTE LOGIN -- TELNET October 1989 + + + Telnets. + + Although the Telnet protocol is based on a perfectly + symmetric model, in a remote login session the role of the + user at a terminal differs from the role of the server + host. For example, RFC-854 defines the meaning of CR, LF, + and CR LF as output from the server, but does not specify + what the User Telnet should send when the user presses the + "end-of-line" key on the terminal; this turns out to be + the point at issue. + + When a user presses the "end-of-line" key, some User + Telnet implementations send CR LF, while others send CR + NUL (based on a different interpretation of the same + sentence in RFC-854). These will be equivalent for a + correctly-implemented ASCII server host, as discussed + above. For other servers, a mode in the User Telnet is + needed.
+ + The existence of User Telnets that send only CR NUL when + CR is pressed creates a dilemma for non-ASCII hosts: they + can either treat CR NUL as equivalent to CR LF in input, + thus precluding the possibility of entering a "bare" CR, + or else lose complete interworking. + + Suppose a user on host A uses Telnet to log into a server + host B, and then executes B's User Telnet program to log + into server host C. It is desirable for the Server/User + Telnet combination on B to be as transparent as possible, + i.e., to appear as if A were connected directly to C. In + particular, correct implementation will make B transparent + to Telnet end-of-line sequences, except that CR LF may be + translated to CR NUL or vice versa. + + IMPLEMENTATION: + To understand Telnet end-of-line issues, one must have at + least a general model of the relationship of Telnet to the + local operating system. The Server Telnet process is + typically coupled into the terminal driver software of the + operating system as a pseudo-terminal. A Telnet end-of- + line sequence received by the Server Telnet must have the + same effect as pressing the end-of-line key on a real + locally-connected terminal. + + Operating systems that support interactive character-at- + a-time applications (e.g., editors) typically have two + internal modes for their terminal I/O: a formatted mode, + in which local conventions for end-of-line and other + + + +Internet Engineering Task Force [Page 22] + + + + +RFC1123 REMOTE LOGIN -- TELNET October 1989 + + + formatting rules have been applied to the data stream, and + a "raw" mode, in which the application has direct access + to every character as it was entered. A Server Telnet + must be implemented in such a way that these modes have + the same effect for remote as for local terminals. For + example, suppose a CR LF or CR NUL is received by the + Server Telnet on an ASCII host.
In raw mode, a CR + character is passed to the application; in formatted mode, + the local system's end-of-line convention is used. + + 3.3.2 Data Entry Terminals + + DISCUSSION: + In addition to the line-oriented and character-oriented + ASCII terminals for which Telnet was designed, there are + several families of video display terminals that are + sometimes known as "data entry terminals" or DETs. The + IBM 3270 family is a well-known example. + + Two Internet protocols have been designed to support + generic DETs: SUPDUP [TELNET:16, TELNET:17], and the DET + option [TELNET:18, TELNET:19]. The DET option drives a + data entry terminal over a Telnet connection using (sub-) + negotiation. SUPDUP is a completely separate terminal + protocol, which can be entered from Telnet by negotiation. + Although both SUPDUP and the DET option have been used + successfully in particular environments, neither has + gained general acceptance or wide implementation. + + A different approach to DET interaction has been developed + for supporting the IBM 3270 family through Telnet, + although the same approach would be applicable to any DET. + The idea is to enter a "native DET" mode, in which the + native DET input/output stream is sent as binary data. + The Telnet EOR command is used to delimit logical records + (e.g., "screens") within this binary stream. + + IMPLEMENTATION: + The rules for entering and leaving native DET mode are as + follows: + + o The Server uses the Terminal-Type option [TELNET:10] + to learn that the client is a DET. + + o It is conventional, but not required, that both ends + negotiate the EOR option [TELNET:9]. + + o Both ends negotiate the Binary option [TELNET:3] to + + + +Internet Engineering Task Force [Page 23] + + + + +RFC1123 REMOTE LOGIN -- TELNET October 1989 + + + enter native DET mode. + + o When either end negotiates out of binary mode, the + other end does too, and the mode then reverts to + normal NVT. 
+ + + 3.3.3 Option Requirements + + Every Telnet implementation MUST support the Binary option + [TELNET:3] and the Suppress Go Ahead option [TELNET:5], and + SHOULD support the Echo [TELNET:4], Status [TELNET:6], End-of- + Record [TELNET:9], and Extended Options List [TELNET:8] + options. + + A User or Server Telnet SHOULD support the Window Size Option + [TELNET:12] if the local operating system provides the + corresponding capability. + + DISCUSSION: + Note that the End-of-Record option only signifies that a + Telnet can receive a Telnet EOR without crashing; + therefore, every Telnet ought to be willing to accept + negotiation of the End-of-Record option. See also the + discussion in Section 3.2.3. + + 3.3.4 Option Initiation + + When the Telnet protocol is used in a client/server situation, + the server SHOULD initiate negotiation of the terminal + interaction mode it expects. + + DISCUSSION: + The Telnet protocol was defined to be perfectly + symmetrical, but its application is generally asymmetric. + Remote login has been known to fail because NEITHER side + initiated negotiation of the required non-default terminal + modes. It is generally the server that determines the + preferred mode, so the server needs to initiate the + negotiation; since the negotiation is symmetric, the user + can also initiate it. + + A client (User Telnet) SHOULD provide a means for users to + enable and disable the initiation of option negotiation. + + DISCUSSION: + A user sometimes needs to connect to an application + service (e.g., FTP or SMTP) that uses Telnet for its + + + +Internet Engineering Task Force [Page 24] + + + + +RFC1123 REMOTE LOGIN -- TELNET October 1989 + + + control stream but does not support Telnet options. User + Telnet may be used for this purpose if initiation of + option negotiation is disabled. + + 3.3.5 Telnet Linemode Option + + DISCUSSION: + An important new Telnet option, LINEMODE [TELNET:12], has + been proposed. 
The LINEMODE option provides a standard + way for a User Telnet and a Server Telnet to agree that + the client rather than the server will perform terminal + character processing. When the client has prepared a + complete line of text, it will send it to the server in + (usually) one TCP packet. This option will greatly + decrease the packet cost of Telnet sessions and will also + give much better user response over congested or long- + delay networks. + + The LINEMODE option allows dynamic switching between local + and remote character processing. For example, the Telnet + connection will automatically negotiate into single- + character mode while a full screen editor is running, and + then return to linemode when the editor is finished. + + We expect that when this RFC is released, hosts should + implement the client side of this option, and may + implement the server side of this option. To properly + implement the server side, the server needs to be able to + tell the local system not to do any input character + processing, but to remember its current terminal state and + notify the Server Telnet process whenever the state + changes. This will allow password echoing and full screen + editors to be handled properly, for example. + + 3.4 TELNET/USER INTERFACE + + 3.4.1 Character Set Transparency + + User Telnet implementations SHOULD be able to send or receive + any 7-bit ASCII character. Where possible, any special + character interpretations by the user host's operating system + SHOULD be bypassed so that these characters can conveniently be + sent and received on the connection. + + Some character value MUST be reserved as "escape to command + mode"; conventionally, doubling this character allows it to be + entered as data. The specific character used SHOULD be user + selectable. 
+ + + +Internet Engineering Task Force [Page 25] + + + + +RFC1123 REMOTE LOGIN -- TELNET October 1989 + + + On binary-mode connections, a User Telnet program MAY provide + an escape mechanism for entering arbitrary 8-bit values, if the + host operating system doesn't allow them to be entered directly + from the keyboard. + + IMPLEMENTATION: + The transparency issues are less pressing on servers, but + implementors should take care in dealing with issues like: + masking off parity bits (sent by an older, non-conforming + client) before they reach programs that expect only NVT + ASCII, and properly handling programs that request 8-bit + data streams. + + 3.4.2 Telnet Commands + + A User Telnet program MUST provide a user the capability of + entering any of the Telnet control functions IP, AO, or AYT, + and SHOULD provide the capability of entering EC, EL, and + Break. + + 3.4.3 TCP Connection Errors + + A User Telnet program SHOULD report to the user any TCP errors + that are reported by the transport layer (see "TCP/Application + Layer Interface" section in [INTRO:1]). + + 3.4.4 Non-Default Telnet Contact Port + + A User Telnet program SHOULD allow the user to optionally + specify a non-standard contact port number at the Server Telnet + host. + + 3.4.5 Flushing Output + + A User Telnet program SHOULD provide the user the ability to + specify whether or not output should be flushed when an IP is + sent; see Section 3.2.4. + + For any output flushing scheme that causes the User Telnet to + flush output locally until a Telnet signal is received from the + Server, there SHOULD be a way for the user to manually restore + normal output, in case the Server fails to send the expected + signal. + + + + + + + + +Internet Engineering Task Force [Page 26] + + + + +RFC1123 REMOTE LOGIN -- TELNET October 1989 + + + 3.5. 
TELNET REQUIREMENTS SUMMARY + + + | | | | |S| | + | | | | |H| |F + | | | | |O|M|o + | | |S| |U|U|o + | | |H| |L|S|t + | |M|O| |D|T|n + | |U|U|M| | |o + | |S|L|A|N|N|t + | |T|D|Y|O|O|t +FEATURE |SECTION | | | |T|T|e +-------------------------------------------------|--------|-|-|-|-|-|-- + | | | | | | | +Option Negotiation |3.2.1 |x| | | | | + Avoid negotiation loops |3.2.1 |x| | | | | + Refuse unsupported options |3.2.1 |x| | | | | + Negotiation OK anytime on connection |3.2.1 | |x| | | | + Default to NVT |3.2.1 |x| | | | | + Send official name in Term-Type option |3.2.8 |x| | | | | + Accept any name in Term-Type option |3.2.8 |x| | | | | + Implement Binary, Suppress-GA options |3.3.3 |x| | | | | + Echo, Status, EOL, Ext-Opt-List options |3.3.3 | |x| | | | + Implement Window-Size option if appropriate |3.3.3 | |x| | | | + Server initiate mode negotiations |3.3.4 | |x| | | | + User can enable/disable init negotiations |3.3.4 | |x| | | | + | | | | | | | +Go-Aheads | | | | | | | + Non-GA server negotiate SUPPRESS-GA option |3.2.2 |x| | | | | + User or Server accept SUPPRESS-GA option |3.2.2 |x| | | | | + User Telnet ignore GA's |3.2.2 | | |x| | | + | | | | | | | +Control Functions | | | | | | | + Support SE NOP DM IP AO AYT SB |3.2.3 |x| | | | | + Support EOR EC EL Break |3.2.3 | | |x| | | + Ignore unsupported control functions |3.2.3 |x| | | | | + User, Server discard urgent data up to DM |3.2.4 |x| | | | | + User Telnet send "Synch" after IP, AO, AYT |3.2.4 | |x| | | | + Server Telnet reply Synch to IP |3.2.4 | | |x| | | + Server Telnet reply Synch to AO |3.2.4 |x| | | | | + User Telnet can flush output when send IP |3.2.4 | |x| | | | + | | | | | | | +Encoding | | | | | | | + Send high-order bit in NVT mode |3.2.5 | | | |x| | + Send high-order bit as parity bit |3.2.5 | | | | |x| + Negot. BINARY if pass high-ord. 
bit to applic |3.2.5 | |x| | | | + Always double IAC data byte |3.2.6 |x| | | | | + + + +Internet Engineering Task Force [Page 27] + + + + +RFC1123 REMOTE LOGIN -- TELNET October 1989 + + + Double IAC data byte in binary mode |3.2.7 |x| | | | | + Obey Telnet cmds in binary mode |3.2.7 |x| | | | | + End-of-line, CR NUL in binary mode |3.2.7 | | | | |x| + | | | | | | | +End-of-Line | | | | | | | + EOL at Server same as local end-of-line |3.3.1 |x| | | | | + ASCII Server accept CR LF or CR NUL for EOL |3.3.1 |x| | | | | + User Telnet able to send CR LF, CR NUL, or LF |3.3.1 |x| | | | | + ASCII user able to select CR LF/CR NUL |3.3.1 | |x| | | | + User Telnet default mode is CR LF |3.3.1 | |x| | | | + Non-interactive uses CR LF for EOL |3.3.1 |x| | | | | + | | | | | | | +User Telnet interface | | | | | | | + Input & output all 7-bit characters |3.4.1 | |x| | | | + Bypass local op sys interpretation |3.4.1 | |x| | | | + Escape character |3.4.1 |x| | | | | + User-settable escape character |3.4.1 | |x| | | | + Escape to enter 8-bit values |3.4.1 | | |x| | | + Can input IP, AO, AYT |3.4.2 |x| | | | | + Can input EC, EL, Break |3.4.2 | |x| | | | + Report TCP connection errors to user |3.4.3 | |x| | | | + Optional non-default contact port |3.4.4 | |x| | | | + Can spec: output flushed when IP sent |3.4.5 | |x| | | | + Can manually restore output mode |3.4.5 | |x| | | | + | | | | | | | + + + + + + + + + + + + + + + + + + + + + + + + + + +Internet Engineering Task Force [Page 28] + + + + +RFC1123 FILE TRANSFER -- FTP October 1989 + + +4. FILE TRANSFER + + 4.1 FILE TRANSFER PROTOCOL -- FTP + + 4.1.1 INTRODUCTION + + The File Transfer Protocol FTP is the primary Internet standard + for file transfer. The current specification is contained in + RFC-959 [FTP:1]. + + FTP uses separate simultaneous TCP connections for control and + for data transfer. The FTP protocol includes many features, + some of which are not commonly implemented. 
However, for every + feature in FTP, there exists at least one implementation. The + minimum implementation defined in RFC-959 was too small, so a + somewhat larger minimum implementation is defined here. + + Internet users have been unnecessarily burdened for years by + deficient FTP implementations. Protocol implementors have + suffered from the erroneous opinion that implementing FTP ought + to be a small and trivial task. This is wrong, because FTP has + a user interface, because it has to deal (correctly) with the + whole variety of communication and operating system errors that + may occur, and because it has to handle the great diversity of + real file systems in the world. + + 4.1.2. PROTOCOL WALK-THROUGH + + 4.1.2.1 LOCAL Type: RFC-959 Section 3.1.1.4 + + An FTP program MUST support TYPE I ("IMAGE" or binary type) + as well as TYPE L 8 ("LOCAL" type with logical byte size 8). + A machine whose memory is organized into m-bit words, where + m is not a multiple of 8, MAY also support TYPE L m. + + DISCUSSION: + The command "TYPE L 8" is often required to transfer + binary data between a machine whose memory is organized + into (e.g.) 36-bit words and a machine with an 8-bit + byte organization. For an 8-bit byte machine, TYPE L 8 + is equivalent to IMAGE. + + "TYPE L m" is sometimes specified to the FTP programs + on two m-bit word machines to ensure the correct + transfer of a native-mode binary file from one machine + to the other. However, this command should have the + same effect on these machines as "TYPE I". + + + + +Internet Engineering Task Force [Page 29] + + + + +RFC1123 FILE TRANSFER -- FTP October 1989 + + + 4.1.2.2 Telnet Format Control: RFC-959 Section 3.1.1.5.2 + + A host that makes no distinction between TYPE N and TYPE T + SHOULD implement TYPE T to be identical to TYPE N. + + DISCUSSION: + This provision should ease interoperation with hosts + that do make this distinction. 
+ + Many hosts represent text files internally as strings + of ASCII characters, using the embedded ASCII format + effector characters (LF, BS, FF, ...) to control the + format when a file is printed. For such hosts, there + is no distinction between "print" files and other + files. However, systems that use record structured + files typically need a special format for printable + files (e.g., ASA carriage control). For the latter + hosts, FTP allows a choice of TYPE N or TYPE T. + + 4.1.2.3 Page Structure: RFC-959 Section 3.1.2.3 and Appendix I + + Implementation of page structure is NOT RECOMMENDED in + general. However, if a host system does need to implement + FTP for "random access" or "holey" files, it MUST use the + defined page structure format rather than define a new + private FTP format. + + 4.1.2.4 Data Structure Transformations: RFC-959 Section 3.1.2 + + An FTP transformation between record-structure and file- + structure SHOULD be invertible, to the extent possible while + making the result useful on the target host. + + DISCUSSION: + RFC-959 required strict invertibility between record- + structure and file-structure, but in practice, + efficiency and convenience often preclude it. + Therefore, the requirement is being relaxed. There are + two different objectives for transferring a file: + processing it on the target host, or just storage. For + storage, strict invertibility is important. For + processing, the file created on the target host needs + to be in the format expected by application programs on + that host. + + As an example of the conflict, imagine a record- + oriented operating system that requires some data files + to have exactly 80 bytes in each record. While STORing + + + +Internet Engineering Task Force [Page 30] + + + + +RFC1123 FILE TRANSFER -- FTP October 1989 + + + a file on such a host, an FTP Server must be able to + pad each line or record to 80 bytes; a later retrieval + of such a file cannot be strictly invertible. 
+ + 4.1.2.5 Data Connection Management: RFC-959 Section 3.3 + + A User-FTP that uses STREAM mode SHOULD send a PORT command + to assign a non-default data port before each transfer + command is issued. + + DISCUSSION: + This is required because of the long delay after a TCP + connection is closed until its socket pair can be + reused, to allow multiple transfers during a single FTP + session. Sending a port command can be avoided if a + transfer mode other than stream is used, by leaving the + data transfer connection open between transfers. + + 4.1.2.6 PASV Command: RFC-959 Section 4.1.2 + + A server-FTP MUST implement the PASV command. + + If multiple third-party transfers are to be executed during + the same session, a new PASV command MUST be issued before + each transfer command, to obtain a unique port pair. + + IMPLEMENTATION: + The format of the 227 reply to a PASV command is not + well standardized. In particular, an FTP client cannot + assume that the parentheses shown on page 40 of RFC-959 + will be present (and in fact, Figure 3 on page 43 omits + them). Therefore, a User-FTP program that interprets + the PASV reply must scan the reply for the first digit + of the host and port numbers. + + Note that the host number h1,h2,h3,h4 is the IP address + of the server host that is sending the reply, and that + p1,p2 is a non-default data transfer port that PASV has + assigned. + + 4.1.2.7 LIST and NLST Commands: RFC-959 Section 4.1.3 + + The data returned by an NLST command MUST contain only a + simple list of legal pathnames, such that the server can use + them directly as the arguments of subsequent data transfer + commands for the individual files. + + The data returned by a LIST or NLST command SHOULD use an + + + +Internet Engineering Task Force [Page 31] + + + + +RFC1123 FILE TRANSFER -- FTP October 1989 + + + implied TYPE AN, unless the current type is EBCDIC, in which + case an implied TYPE EN SHOULD be used.
+ + DISCUSSION: + Many FTP clients support macro-commands that will get + or put files matching a wildcard specification, using + NLST to obtain a list of pathnames. The expansion of + "multiple-put" is local to the client, but "multiple- + get" requires cooperation by the server. + + The implied type for LIST and NLST is designed to + provide compatibility with existing User-FTPs, and in + particular with multiple-get commands. + + 4.1.2.8 SITE Command: RFC-959 Section 4.1.3 + + A Server-FTP SHOULD use the SITE command for non-standard + features, rather than invent new private commands or + unstandardized extensions to existing commands. + + 4.1.2.9 STOU Command: RFC-959 Section 4.1.3 + + The STOU command stores into a uniquely named file. When it + receives an STOU command, a Server-FTP MUST return the + actual file name in the "125 Transfer Starting" or the "150 + Opening Data Connection" message that precedes the transfer + (the 250 reply code mentioned in RFC-959 is incorrect). The + exact format of these messages is hereby defined to be as + follows: + + 125 FILE: pppp + 150 FILE: pppp + + where pppp represents the unique pathname of the file that + will be written. + + 4.1.2.10 Telnet End-of-line Code: RFC-959, Page 34 + + Implementors MUST NOT assume any correspondence between READ + boundaries on the control connection and the Telnet EOL + sequences (CR LF). + + DISCUSSION: + Thus, a server-FTP (or User-FTP) must continue reading + characters from the control connection until a complete + Telnet EOL sequence is encountered, before processing + the command (or response, respectively). Conversely, a + single READ from the control connection may include + + + +Internet Engineering Task Force [Page 32] + + + + +RFC1123 FILE TRANSFER -- FTP October 1989 + + + more than one FTP command. + + 4.1.2.11 FTP Replies: RFC-959 Section 4.2, Page 35 + + A Server-FTP MUST send only correctly formatted replies on + the control connection. 
Note that RFC-959 (unlike earlier + versions of the FTP spec) contains no provision for a + "spontaneous" reply message. + + A Server-FTP SHOULD use the reply codes defined in RFC-959 + whenever they apply. However, a server-FTP MAY use a + different reply code when needed, as long as the general + rules of Section 4.2 are followed. When the implementor has + a choice between a 4xx and 5xx reply code, a Server-FTP + SHOULD send a 4xx (temporary failure) code when there is any + reasonable possibility that a failed FTP will succeed a few + hours later. + + A User-FTP SHOULD generally use only the highest-order digit + of a 3-digit reply code for making a procedural decision, to + prevent difficulties when a Server-FTP uses non-standard + reply codes. + + A User-FTP MUST be able to handle multi-line replies. If + the implementation imposes a limit on the number of lines + and if this limit is exceeded, the User-FTP MUST recover, + e.g., by ignoring the excess lines until the end of the + multi-line reply is reached. + + A User-FTP SHOULD NOT interpret a 421 reply code ("Service + not available, closing control connection") specially, but + SHOULD detect closing of the control connection by the + server. + + DISCUSSION: + Server implementations that fail to strictly follow the + reply rules often cause FTP user programs to hang. + Note that RFC-959 resolved ambiguities in the reply + rules found in earlier FTP specifications and must be + followed. + + It is important to choose FTP reply codes that properly + distinguish between temporary and permanent failures, + to allow the successful use of file transfer client + daemons. These programs depend on the reply codes to + decide whether or not to retry a failed transfer; using + a permanent failure code (5xx) for a temporary error + will cause these programs to give up unnecessarily. 
+ + + +Internet Engineering Task Force [Page 33] + + + + +RFC1123 FILE TRANSFER -- FTP October 1989 + + + When the meaning of a reply matches exactly the text + shown in RFC-959, uniformity will be enhanced by using + the RFC-959 text verbatim. However, a Server-FTP + implementor is encouraged to choose reply text that + conveys specific system-dependent information, when + appropriate. + + 4.1.2.12 Connections: RFC-959 Section 5.2 + + The words "and the port used" in the second paragraph of + this section of RFC-959 are erroneous (historical), and they + should be ignored. + + On a multihomed server host, the default data transfer port + (L-1) MUST be associated with the same local IP address as + the corresponding control connection to port L. + + A user-FTP MUST NOT send any Telnet controls other than + SYNCH and IP on an FTP control connection. In particular, it + MUST NOT attempt to negotiate Telnet options on the control + connection. However, a server-FTP MUST be capable of + accepting and refusing Telnet negotiations (i.e., sending + DONT/WONT). + + DISCUSSION: + Although the RFC says: "Server- and User- processes + should follow the conventions for the Telnet + protocol...[on the control connection]", it is not the + intent that Telnet option negotiation is to be + employed. + + 4.1.2.13 Minimum Implementation; RFC-959 Section 5.1 + + The following commands and options MUST be supported by + every server-FTP and user-FTP, except in cases where the + underlying file system or operating system does not allow or + support a particular command. + + Type: ASCII Non-print, IMAGE, LOCAL 8 + Mode: Stream + Structure: File, Record* + Commands: + USER, PASS, ACCT, + PORT, PASV, + TYPE, MODE, STRU, + RETR, STOR, APPE, + RNFR, RNTO, DELE, + CWD, CDUP, RMD, MKD, PWD, + + + +Internet Engineering Task Force [Page 34] + + + + +RFC1123 FILE TRANSFER -- FTP October 1989 + + + LIST, NLST, + SYST, STAT, + HELP, NOOP, QUIT. 
+ + *Record structure is REQUIRED only for hosts whose file + systems support record structure. + + DISCUSSION: + Vendors are encouraged to implement a larger subset of + the protocol. For example, there are important + robustness features in the protocol (e.g., Restart, + ABOR, block mode) that would be an aid to some Internet + users but are not widely implemented. + + A host that does not have record structures in its file + system may still accept files with STRU R, recording + the byte stream literally. + + 4.1.3 SPECIFIC ISSUES + + 4.1.3.1 Non-standard Command Verbs + + FTP allows "experimental" commands, whose names begin with + "X". If these commands are subsequently adopted as + standards, there may still be existing implementations using + the "X" form. At present, this is true for the directory + commands: + + RFC-959 "Experimental" + + MKD XMKD + RMD XRMD + PWD XPWD + CDUP XCUP + CWD XCWD + + All FTP implementations SHOULD recognize both forms of these + commands, by simply equating them with extra entries in the + command lookup table. + + IMPLEMENTATION: + A User-FTP can access a server that supports only the + "X" forms by implementing a mode switch, or + automatically using the following procedure: if the + RFC-959 form of one of the above commands is rejected + with a 500 or 502 response code, then try the + experimental form; any other response would be passed + to the user. + + + +Internet Engineering Task Force [Page 35] + + + + +RFC1123 FILE TRANSFER -- FTP October 1989 + + + 4.1.3.2 Idle Timeout + + A Server-FTP process SHOULD have an idle timeout, which will + terminate the process and close the control connection if + the server is inactive (i.e., no command or data transfer in + progress) for a long period of time. The idle timeout time + SHOULD be configurable, and the default should be at least 5 + minutes. + + A client FTP process ("User-PI" in RFC-959) will need + timeouts on responses only if it is invoked from a program. 
+ + DISCUSSION: + Without a timeout, a Server-FTP process may be left + pending indefinitely if the corresponding client + crashes without closing the control connection. + + 4.1.3.3 Concurrency of Data and Control + + DISCUSSION: + The intent of the designers of FTP was that a user + should be able to send a STAT command at any time while + data transfer was in progress and that the server-FTP + would reply immediately with status -- e.g., the number + of bytes transferred so far. Similarly, an ABOR + command should be possible at any time during a data + transfer. + + Unfortunately, some small-machine operating systems + make such concurrent programming difficult, and some + other implementers seek minimal solutions, so some FTP + implementations do not allow concurrent use of the data + and control connections. Even such a minimal server + must be prepared to accept and defer a STAT or ABOR + command that arrives during data transfer. + + 4.1.3.4 FTP Restart Mechanism + + The description of the 110 reply on pp. 40-41 of RFC-959 is + incorrect; the correct description is as follows. A restart + reply message, sent over the control connection from the + receiving FTP to the User-FTP, has the format: + + 110 MARK ssss = rrrr + + Here: + + * ssss is a text string that appeared in a Restart Marker + + + +Internet Engineering Task Force [Page 36] + + + + +RFC1123 FILE TRANSFER -- FTP October 1989 + + + in the data stream and encodes a position in the + sender's file system; + + * rrrr encodes the corresponding position in the + receiver's file system. + + The encoding, which is specific to a particular file system + and network implementation, is always generated and + interpreted by the same system, either sender or receiver. + + When an FTP that implements restart receives a Restart + Marker in the data stream, it SHOULD force the data to that + point to be written to stable storage before encoding the + corresponding position rrrr. 
An FTP sending Restart Markers + MUST NOT assume that 110 replies will be returned + synchronously with the data, i.e., it must not await a 110 + reply before sending more data. + + Two new reply codes are hereby defined for errors + encountered in restarting a transfer: + + 554 Requested action not taken: invalid REST parameter. + + A 554 reply may result from a FTP service command that + follows a REST command. The reply indicates that the + existing file at the Server-FTP cannot be repositioned + as specified in the REST. + + 555 Requested action not taken: type or stru mismatch. + + A 555 reply may result from an APPE command or from any + FTP service command following a REST command. The + reply indicates that there is some mismatch between the + current transfer parameters (type and stru) and the + attributes of the existing file. + + DISCUSSION: + Note that the FTP Restart mechanism requires that Block + or Compressed mode be used for data transfer, to allow + the Restart Markers to be included within the data + stream. The frequency of Restart Markers can be low. + + Restart Markers mark a place in the data stream, but + the receiver may be performing some transformation on + the data as it is stored into stable storage. In + general, the receiver's encoding must include any state + information necessary to restart this transformation at + any point of the FTP data stream. For example, in TYPE + + + +Internet Engineering Task Force [Page 37] + + + + +RFC1123 FILE TRANSFER -- FTP October 1989 + + + A transfers, some receiver hosts transform CR LF + sequences into a single LF character on disk. If a + Restart Marker happens to fall between CR and LF, the + receiver must encode in rrrr that the transfer must be + restarted in a "CR has been seen and discarded" state. + + Note that the Restart Marker is required to be encoded + as a string of printable ASCII characters, regardless + of the type of the data. 
+ + RFC-959 says that restart information is to be returned + "to the user". This should not be taken literally. In + general, the User-FTP should save the restart + information (ssss,rrrr) in stable storage, e.g., append + it to a restart control file. An empty restart control + file should be created when the transfer first starts + and deleted automatically when the transfer completes + successfully. It is suggested that this file have a + name derived in an easily-identifiable manner from the + name of the file being transferred and the remote host + name; this is analogous to the means used by many text + editors for naming "backup" files. + + There are three cases for FTP restart. + + (1) User-to-Server Transfer + + The User-FTP puts Restart Markers <ssss> at + convenient places in the data stream. When the + Server-FTP receives a Marker, it writes all prior + data to disk, encodes its file system position and + transformation state as rrrr, and returns a "110 + MARK ssss = rrrr" reply over the control + connection. The User-FTP appends the pair + (ssss,rrrr) to its restart control file. + + To restart the transfer, the User-FTP fetches the + last (ssss,rrrr) pair from the restart control + file, repositions its local file system and + transformation state using ssss, and sends the + command "REST rrrr" to the Server-FTP. + + (2) Server-to-User Transfer + + The Server-FTP puts Restart Markers <ssss> at + convenient places in the data stream. When the + User-FTP receives a Marker, it writes all prior + data to disk, encodes its file system position and + + + +Internet Engineering Task Force [Page 38] + + + + +RFC1123 FILE TRANSFER -- FTP October 1989 + + + transformation state as rrrr, and appends the pair + (rrrr,ssss) to its restart control file.
+ + To restart the transfer, the User-FTP fetches the + last (rrrr,ssss) pair from the restart control + file, repositions its local file system and + transformation state using rrrr, and sends the + command "REST ssss" to the Server-FTP. + + (3) Server-to-Server ("Third-Party") Transfer + + The sending Server-FTP puts Restart Markers <ssss> + at convenient places in the data stream. When it + receives a Marker, the receiving Server-FTP writes + all prior data to disk, encodes its file system + position and transformation state as rrrr, and + sends a "110 MARK ssss = rrrr" reply over the + control connection to the User. The User-FTP + appends the pair (ssss,rrrr) to its restart + control file. + + To restart the transfer, the User-FTP fetches the + last (ssss,rrrr) pair from the restart control + file, sends "REST ssss" to the sending Server-FTP, + and sends "REST rrrr" to the receiving Server-FTP. + + + 4.1.4 FTP/USER INTERFACE + + This section discusses the user interface for a User-FTP + program. + + 4.1.4.1 Pathname Specification + + Since FTP is intended for use in a heterogeneous + environment, User-FTP implementations MUST support remote + pathnames as arbitrary character strings, so that their form + and content are not limited by the conventions of the local + operating system. + + DISCUSSION: + In particular, remote pathnames can be of arbitrary + length, and all the printing ASCII characters as well + as space (0x20) must be allowed. RFC-959 allows a + pathname to contain any 7-bit ASCII character except CR + or LF. + + + + + +Internet Engineering Task Force [Page 39] + + + + +RFC1123 FILE TRANSFER -- FTP October 1989 + + + 4.1.4.2 "QUOTE" Command + + A User-FTP program MUST implement a "QUOTE" command that + will pass an arbitrary character string to the server and + display all resulting response messages to the user.
+ + To make the "QUOTE" command useful, a User-FTP SHOULD send + transfer control commands to the server as the user enters + them, rather than saving all the commands and sending them + to the server only when a data transfer is started. + + DISCUSSION: + The "QUOTE" command is essential to allow the user to + access servers that require system-specific commands + (e.g., SITE or ALLO), or to invoke new or optional + features that are not implemented by the User-FTP. For + example, "QUOTE" may be used to specify "TYPE A T" to + send a print file to hosts that require the + distinction, even if the User-FTP does not recognize + that TYPE. + + 4.1.4.3 Displaying Replies to User + + A User-FTP SHOULD display to the user the full text of all + error reply messages it receives. It SHOULD have a + "verbose" mode in which all commands it sends and the full + text and reply codes it receives are displayed, for + diagnosis of problems. + + 4.1.4.4 Maintaining Synchronization + + The state machine in a User-FTP SHOULD be forgiving of + missing and unexpected reply messages, in order to maintain + command synchronization with the server. + + + + + + + + + + + + + + + + + +Internet Engineering Task Force [Page 40] + + + + +RFC1123 FILE TRANSFER -- FTP October 1989 + + + 4.1.5 FTP REQUIREMENTS SUMMARY + + | | | | |S| | + | | | | |H| |F + | | | | |O|M|o + | | |S| |U|U|o + | | |H| |L|S|t + | |M|O| |D|T|n + | |U|U|M| | |o + | |S|L|A|N|N|t + | |T|D|Y|O|O|t +FEATURE |SECTION | | | |T|T|e +-------------------------------------------|---------------|-|-|-|-|-|-- +Implement TYPE T if same as TYPE N |4.1.2.2 | |x| | | | +File/Record transform invertible if poss. 
|4.1.2.4 | |x| | | | +User-FTP send PORT cmd for stream mode |4.1.2.5 | |x| | | | +Server-FTP implement PASV |4.1.2.6 |x| | | | | + PASV is per-transfer |4.1.2.6 |x| | | | | +NLST reply usable in RETR cmds |4.1.2.7 |x| | | | | +Implied type for LIST and NLST |4.1.2.7 | |x| | | | +SITE cmd for non-standard features |4.1.2.8 | |x| | | | +STOU cmd return pathname as specified |4.1.2.9 |x| | | | | +Use TCP READ boundaries on control conn. |4.1.2.10 | | | | |x| + | | | | | | | +Server-FTP send only correct reply format |4.1.2.11 |x| | | | | +Server-FTP use defined reply code if poss. |4.1.2.11 | |x| | | | + New reply code following Section 4.2 |4.1.2.11 | | |x| | | +User-FTP use only high digit of reply |4.1.2.11 | |x| | | | +User-FTP handle multi-line reply lines |4.1.2.11 |x| | | | | +User-FTP handle 421 reply specially |4.1.2.11 | | | |x| | + | | | | | | | +Default data port same IP addr as ctl conn |4.1.2.12 |x| | | | | +User-FTP send Telnet cmds exc. SYNCH, IP |4.1.2.12 | | | | |x| +User-FTP negotiate Telnet options |4.1.2.12 | | | | |x| +Server-FTP handle Telnet options |4.1.2.12 |x| | | | | +Handle "Experimental" directory cmds |4.1.3.1 | |x| | | | +Idle timeout in server-FTP |4.1.3.2 | |x| | | | + Configurable idle timeout |4.1.3.2 | |x| | | | +Receiver checkpoint data at Restart Marker |4.1.3.4 | |x| | | | +Sender assume 110 replies are synchronous |4.1.3.4 | | | | |x| + | | | | | | | +Support TYPE: | | | | | | | + ASCII - Non-Print (AN) |4.1.2.13 |x| | | | | + ASCII - Telnet (AT) -- if same as AN |4.1.2.2 | |x| | | | + ASCII - Carriage Control (AC) |959 3.1.1.5.2 | | |x| | | + EBCDIC - (any form) |959 3.1.1.2 | | |x| | | + IMAGE |4.1.2.1 |x| | | | | + LOCAL 8 |4.1.2.1 |x| | | | | + + + +Internet Engineering Task Force [Page 41] + + + + +RFC1123 FILE TRANSFER -- FTP October 1989 + + + LOCAL m |4.1.2.1 | | |x| | |2 + | | | | | | | +Support MODE: | | | | | | | + Stream |4.1.2.13 |x| | | | | + Block |959 3.4.2 | | |x| | | + | | | | | | | +Support STRUCTURE: | | | 
| | | | + File |4.1.2.13 |x| | | | | + Record |4.1.2.13 |x| | | | |3 + Page |4.1.2.3 | | | |x| | + | | | | | | | +Support commands: | | | | | | | + USER |4.1.2.13 |x| | | | | + PASS |4.1.2.13 |x| | | | | + ACCT |4.1.2.13 |x| | | | | + CWD |4.1.2.13 |x| | | | | + CDUP |4.1.2.13 |x| | | | | + SMNT |959 5.3.1 | | |x| | | + REIN |959 5.3.1 | | |x| | | + QUIT |4.1.2.13 |x| | | | | + | | | | | | | + PORT |4.1.2.13 |x| | | | | + PASV |4.1.2.6 |x| | | | | + TYPE |4.1.2.13 |x| | | | |1 + STRU |4.1.2.13 |x| | | | |1 + MODE |4.1.2.13 |x| | | | |1 + | | | | | | | + RETR |4.1.2.13 |x| | | | | + STOR |4.1.2.13 |x| | | | | + STOU |959 5.3.1 | | |x| | | + APPE |4.1.2.13 |x| | | | | + ALLO |959 5.3.1 | | |x| | | + REST |959 5.3.1 | | |x| | | + RNFR |4.1.2.13 |x| | | | | + RNTO |4.1.2.13 |x| | | | | + ABOR |959 5.3.1 | | |x| | | + DELE |4.1.2.13 |x| | | | | + RMD |4.1.2.13 |x| | | | | + MKD |4.1.2.13 |x| | | | | + PWD |4.1.2.13 |x| | | | | + LIST |4.1.2.13 |x| | | | | + NLST |4.1.2.13 |x| | | | | + SITE |4.1.2.8 | | |x| | | + STAT |4.1.2.13 |x| | | | | + SYST |4.1.2.13 |x| | | | | + HELP |4.1.2.13 |x| | | | | + NOOP |4.1.2.13 |x| | | | | + | | | | | | | + + + +Internet Engineering Task Force [Page 42] + + + + +RFC1123 FILE TRANSFER -- FTP October 1989 + + +User Interface: | | | | | | | + Arbitrary pathnames |4.1.4.1 |x| | | | | + Implement "QUOTE" command |4.1.4.2 |x| | | | | + Transfer control commands immediately |4.1.4.2 | |x| | | | + Display error messages to user |4.1.4.3 | |x| | | | + Verbose mode |4.1.4.3 | |x| | | | + Maintain synchronization with server |4.1.4.4 | |x| | | | + +Footnotes: + +(1) For the values shown earlier. + +(2) Here m is number of bits in a memory word. + +(3) Required for host with record-structured file system, optional + otherwise. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Internet Engineering Task Force [Page 43] + + + + +RFC1123 FILE TRANSFER -- TFTP October 1989 + + + 4.2 TRIVIAL FILE TRANSFER PROTOCOL -- TFTP + + 4.2.1 INTRODUCTION + + The Trivial File Transfer Protocol TFTP is defined in RFC-783 + [TFTP:1]. + + TFTP provides its own reliable delivery with UDP as its + transport protocol, using a simple stop-and-wait acknowledgment + system. Since TFTP has an effective window of only one 512 + octet segment, it can provide good performance only over paths + that have a small delay*bandwidth product. The TFTP file + interface is very simple, providing no access control or + security. + + TFTP's most important application is bootstrapping a host over + a local network, since it is simple and small enough to be + easily implemented in EPROM [BOOT:1, BOOT:2]. Vendors are + urged to support TFTP for booting. + + 4.2.2 PROTOCOL WALK-THROUGH + + The TFTP specification [TFTP:1] is written in an open style, + and does not fully specify many parts of the protocol. + + 4.2.2.1 Transfer Modes: RFC-783, Page 3 + + The transfer mode "mail" SHOULD NOT be supported. + + 4.2.2.2 UDP Header: RFC-783, Page 17 + + The Length field of a UDP header is incorrectly defined; it + includes the UDP header length (8). + + 4.2.3 SPECIFIC ISSUES + + 4.2.3.1 Sorcerer's Apprentice Syndrome + + There is a serious bug, known as the "Sorcerer's Apprentice + Syndrome," in the protocol specification. While it does not + cause incorrect operation of the transfer (the file will + always be transferred correctly if the transfer completes), + this bug may cause excessive retransmission, which may cause + the transfer to time out. 
+ + Implementations MUST contain the fix for this problem: the + sender (i.e., the side originating the DATA packets) must + never resend the current DATA packet on receipt of a + + + +Internet Engineering Task Force [Page 44] + + + + +RFC1123 FILE TRANSFER -- TFTP October 1989 + + + duplicate ACK. + + DISCUSSION: + The bug is caused by the protocol rule that either + side, on receiving an old duplicate datagram, may + resend the current datagram. If a packet is delayed in + the network but later successfully delivered after + either side has timed out and retransmitted a packet, a + duplicate copy of the response may be generated. If + the other side responds to this duplicate with a + duplicate of its own, then every datagram will be sent + in duplicate for the remainder of the transfer (unless + a datagram is lost, breaking the repetition). Worse + yet, since the delay is often caused by congestion, + this duplicate transmission will usually cause more + congestion, leading to more delayed packets, etc. + + The following example may help to clarify this problem. + + TFTP A TFTP B + + (1) Receive ACK X-1 + Send DATA X + (2) Receive DATA X + Send ACK X + (ACK X is delayed in network, + and A times out): + (3) Retransmit DATA X + + (4) Receive DATA X again + Send ACK X again + (5) Receive (delayed) ACK X + Send DATA X+1 + (6) Receive DATA X+1 + Send ACK X+1 + (7) Receive ACK X again + Send DATA X+1 again + (8) Receive DATA X+1 again + Send ACK X+1 again + (9) Receive ACK X+1 + Send DATA X+2 + (10) Receive DATA X+2 + Send ACK X+2 + (11) Receive ACK X+1 again + Send DATA X+2 again + (12) Receive DATA X+2 again + Send ACK X+2 again + + + + +Internet Engineering Task Force [Page 45] + + + + +RFC1123 FILE TRANSFER -- TFTP October 1989 + + + Notice that once the delayed ACK arrives, the protocol + settles down to duplicate all further packets + (sequences 5-8 and 9-12).
The problem is caused not by + either side timing out, but by both sides + retransmitting the current packet when they receive a + duplicate. + + The fix is to break the retransmission loop, as + indicated above. This is analogous to the behavior of + TCP. It is then possible to remove the retransmission + timer on the receiver, since the resent ACK will never + cause any action; this is a useful simplification where + TFTP is used in a bootstrap program. It is OK to allow + the timer to remain, and it may be helpful if the + retransmitted ACK replaces one that was genuinely lost + in the network. The sender still requires a retransmit + timer, of course. + + 4.2.3.2 Timeout Algorithms + + A TFTP implementation MUST use an adaptive timeout. + + IMPLEMENTATION: + TCP retransmission algorithms provide a useful base to + work from. At least an exponential backoff of + retransmission timeout is necessary. + + 4.2.3.3 Extensions + + A variety of non-standard extensions have been made to TFTP, + including additional transfer modes and a secure operation + mode (with passwords). None of these have been + standardized. + + 4.2.3.4 Access Control + + A server TFTP implementation SHOULD include some + configurable access control over what pathnames are allowed + in TFTP operations. + + 4.2.3.5 Broadcast Request + + A TFTP request directed to a broadcast address SHOULD be + silently ignored. + + DISCUSSION: + Due to the weak access control capability of TFTP, + directed broadcasts of TFTP requests to random networks + + + +Internet Engineering Task Force [Page 46] + + + + +RFC1123 FILE TRANSFER -- TFTP October 1989 + + + could create a significant security hole. 
+ + 4.2.4 TFTP REQUIREMENTS SUMMARY + + | | | | |S| | + | | | | |H| |F + | | | | |O|M|o + | | |S| |U|U|o + | | |H| |L|S|t + | |M|O| |D|T|n + | |U|U|M| | |o + | |S|L|A|N|N|t + | |T|D|Y|O|O|t +FEATURE |SECTION | | | |T|T|e +-------------------------------------------------|--------|-|-|-|-|-|-- +Fix Sorcerer's Apprentice Syndrome |4.2.3.1 |x| | | | | +Transfer modes: | | | | | | | + netascii |RFC-783 |x| | | | | + octet |RFC-783 |x| | | | | + mail |4.2.2.1 | | | |x| | + extensions |4.2.3.3 | | |x| | | +Use adaptive timeout |4.2.3.2 |x| | | | | +Configurable access control |4.2.3.4 | |x| | | | +Silently ignore broadcast request |4.2.3.5 | |x| | | | +-------------------------------------------------|--------|-|-|-|-|-|-- +-------------------------------------------------|--------|-|-|-|-|-|-- + + + + + + + + + + + + + + + + + + + + + + + + + +Internet Engineering Task Force [Page 47] + + + + +RFC1123 MAIL -- SMTP & RFC-822 October 1989 + + +5. ELECTRONIC MAIL -- SMTP and RFC-822 + + 5.1 INTRODUCTION + + In the TCP/IP protocol suite, electronic mail in a format + specified in RFC-822 [SMTP:2] is transmitted using the Simple Mail + Transfer Protocol (SMTP) defined in RFC-821 [SMTP:1]. + + While SMTP has remained unchanged over the years, the Internet + community has made several changes in the way SMTP is used. In + particular, the conversion to the Domain Name System (DNS) has + caused changes in address formats and in mail routing. In this + section, we assume familiarity with the concepts and terminology + of the DNS, whose requirements are given in Section 6.1. + + RFC-822 specifies the Internet standard format for electronic mail + messages. RFC-822 supercedes an older standard, RFC-733, that may + still be in use in a few places, although it is obsolete. The two + formats are sometimes referred to simply by number ("822" and + "733"). 
+ + RFC-822 is used in some non-Internet mail environments with + different mail transfer protocols than SMTP, and SMTP has also + been adapted for use in some non-Internet environments. Note that + this document presents the rules for the use of SMTP and RFC-822 + for the Internet environment only; other mail environments that + use these protocols may be expected to have their own rules. + + 5.2 PROTOCOL WALK-THROUGH + + This section covers both RFC-821 and RFC-822. + + The SMTP specification in RFC-821 is clear and contains numerous + examples, so implementors should not find it difficult to + understand. This section simply updates or annotates portions of + RFC-821 to conform with current usage. + + RFC-822 is a long and dense document, defining a rich syntax. + Unfortunately, incomplete or defective implementations of RFC-822 + are common. In fact, nearly all of the many formats of RFC-822 + are actually used, so an implementation generally needs to + recognize and correctly interpret all of the RFC-822 syntax. + + 5.2.1 The SMTP Model: RFC-821 Section 2 + + DISCUSSION: + Mail is sent by a series of request/response transactions + between a client, the "sender-SMTP," and a server, the + + + +Internet Engineering Task Force [Page 48] + + + + +RFC1123 MAIL -- SMTP & RFC-822 October 1989 + + + "receiver-SMTP". These transactions pass (1) the message + proper, which is composed of header and body, and (2) SMTP + source and destination addresses, referred to as the + "envelope". + + The SMTP programs are analogous to Message Transfer Agents + (MTAs) of X.400. There will be another level of protocol + software, closer to the end user, that is responsible for + composing and analyzing RFC-822 message headers; this + component is known as the "User Agent" in X.400, and we + use that term in this document. There is a clear logical + distinction between the User Agent and the SMTP + implementation, since they operate on different levels of + protocol. 
 Note, however, that this distinction may not + be exactly reflected in the structure of typical + implementations of Internet mail. Often there is a + program known as the "mailer" that implements SMTP and + also some of the User Agent functions; the rest of the + User Agent functions are included in a user interface used + for entering and reading mail. + + The SMTP envelope is constructed at the originating site, + typically by the User Agent when the message is first + queued for the Sender-SMTP program. The envelope + addresses may be derived from information in the message + header, supplied by the user interface (e.g., to implement + a bcc: request), or derived from local configuration + information (e.g., expansion of a mailing list). The SMTP + envelope cannot in general be re-derived from the header + at a later stage in message delivery, so the envelope is + transmitted separately from the message itself using the + MAIL and RCPT commands of SMTP. + + The text of RFC-821 suggests that mail is to be delivered + to an individual user at a host. With the advent of the + domain system and of mail routing using mail-exchange (MX) + resource records, implementors should now think of + delivering mail to a user at a domain, which may or may + not be a particular host. This DOES NOT change the fact + that SMTP is a host-to-host mail exchange protocol. + + 5.2.2 Canonicalization: RFC-821 Section 3.1 + + The domain names that a Sender-SMTP sends in MAIL and RCPT + commands MUST have been "canonicalized," i.e., they must be + fully-qualified principal names or domain literals, not + nicknames or domain abbreviations. A canonicalized name either + identifies a host directly or is an MX name; it cannot be a + + + +Internet Engineering Task Force [Page 49] + + + + +RFC1123 MAIL -- SMTP & RFC-822 October 1989 + + + CNAME.
+ + 5.2.3 VRFY and EXPN Commands: RFC-821 Section 3.3 + + A receiver-SMTP MUST implement VRFY and SHOULD implement EXPN + (this requirement overrides RFC-821). However, there MAY be + configuration information to disable VRFY and EXPN in a + particular installation; this might even allow EXPN to be + disabled for selected lists. + + A new reply code is defined for the VRFY command: + + 252 Cannot VRFY user (e.g., info is not local), but will + take message for this user and attempt delivery. + + DISCUSSION: + SMTP users and administrators make regular use of these + commands for diagnosing mail delivery problems. With the + increasing use of multi-level mailing list expansion + (sometimes more than two levels), EXPN has been + increasingly important for diagnosing inadvertent mail + loops. On the other hand, some feel that EXPN represents + a significant privacy, and perhaps even a security, + exposure. + + 5.2.4 SEND, SOML, and SAML Commands: RFC-821 Section 3.4 + + An SMTP MAY implement the commands to send a message to a + user's terminal: SEND, SOML, and SAML. + + DISCUSSION: + It has been suggested that the use of mail relaying + through an MX record is inconsistent with the intent of + SEND to deliver a message immediately and directly to a + user's terminal. However, an SMTP receiver that is unable + to write directly to the user terminal can return a "251 + User Not Local" reply to the RCPT following a SEND, to + inform the originator of possibly deferred delivery. + + 5.2.5 HELO Command: RFC-821 Section 3.5 + + The sender-SMTP MUST ensure that the <domain> parameter in a + HELO command is a valid principal host domain name for the + client host. As a result, the receiver-SMTP will not have to + perform MX resolution on this name in order to validate the + HELO parameter.
+ + The HELO receiver MAY verify that the HELO parameter really + + + +Internet Engineering Task Force [Page 50] + + + + +RFC1123 MAIL -- SMTP & RFC-822 October 1989 + + + corresponds to the IP address of the sender. However, the + receiver MUST NOT refuse to accept a message, even if the + sender's HELO command fails verification. + + DISCUSSION: + Verifying the HELO parameter requires a domain name lookup + and may therefore take considerable time. An alternative + tool for tracking bogus mail sources is suggested below + (see "DATA Command"). + + Note also that the HELO argument is still required to have + valid <domain> syntax, since it will appear in a Received: + line; otherwise, a 501 error is to be sent. + + IMPLEMENTATION: + When HELO parameter validation fails, a suggested + procedure is to insert a note about the unknown + authenticity of the sender into the message header (e.g., + in the "Received:" line). + + 5.2.6 Mail Relay: RFC-821 Section 3.6 + + We distinguish three types of mail (store-and-) forwarding: + + (1) A simple forwarder or "mail exchanger" forwards a message + using private knowledge about the recipient; see section + 3.2 of RFC-821. + + (2) An SMTP mail "relay" forwards a message within an SMTP + mail environment as the result of an explicit source route + (as defined in section 3.6 of RFC-821). The SMTP relay + function uses the "@...:" form of source route from RFC- + 822 (see Section 5.2.19 below). + + (3) A mail "gateway" passes a message between different + environments. The rules for mail gateways are discussed + below in Section 5.3.7. + + An Internet host that is forwarding a message but is not a + gateway to a different mail environment (i.e., it falls under + (1) or (2)) SHOULD NOT alter any existing header fields, + although the host will add an appropriate Received: line as + required in Section 5.2.8. + + A Sender-SMTP SHOULD NOT send a RCPT TO: command containing an + explicit source route using the "@...:" address form.
 Thus, + the relay function defined in section 3.6 of RFC-821 should + not be used. + + + +Internet Engineering Task Force [Page 51] + + + + +RFC1123 MAIL -- SMTP & RFC-822 October 1989 + + + DISCUSSION: + The intent is to discourage all source routing and to + abolish explicit source routing for mail delivery within + the Internet environment. Source-routing is unnecessary; + the simple target address "user@domain" should always + suffice. This is the result of an explicit architectural + decision to use universal naming rather than source + routing for mail. Thus, SMTP provides end-to-end + connectivity, and the DNS provides globally-unique, + location-independent names. MX records handle the major + case where source routing might otherwise be needed. + + A receiver-SMTP MUST accept the explicit source route syntax in + the envelope, but it MAY implement the relay function as + defined in section 3.6 of RFC-821. If it does not implement + the relay function, it SHOULD attempt to deliver the message + directly to the host to the right of the right-most "@" sign. + + DISCUSSION: + For example, suppose a host that does not implement the + relay function receives a message with the SMTP command: + "RCPT TO:<@ALPHA,@BETA:joe@GAMMA>", where ALPHA, BETA, and + GAMMA represent domain names. Rather than immediately + refusing the message with a 550 error reply as suggested + on page 20 of RFC-821, the host should try to forward the + message to GAMMA directly, using: "RCPT TO:<joe@GAMMA>". + Since this host does not support relaying, it is not + required to update the reverse path. + + Some have suggested that source routing may be needed + occasionally for manually routing mail around failures; + however, the reality and importance of this need is + controversial. The use of explicit SMTP mail relaying for + this purpose is discouraged, and in fact it may not be + successful, as many host systems do not support it.
Some + have used the "%-hack" (see Section 5.2.16) for this + purpose. + + 5.2.7 RCPT Command: RFC-821 Section 4.1.1 + + A host that supports a receiver-SMTP MUST support the reserved + mailbox "Postmaster". + + The receiver-SMTP MAY verify RCPT parameters as they arrive; + however, RCPT responses MUST NOT be delayed beyond a reasonable + time (see Section 5.3.2). + + Therefore, a "250 OK" response to a RCPT does not necessarily + + + +Internet Engineering Task Force [Page 52] + + + + +RFC1123 MAIL -- SMTP & RFC-822 October 1989 + + + imply that the delivery address(es) are valid. Errors found + after message acceptance will be reported by mailing a + notification message to an appropriate address (see Section + 5.3.3). + + DISCUSSION: + The set of conditions under which a RCPT parameter can be + validated immediately is an engineering design choice. + Reporting destination mailbox errors to the Sender-SMTP + before mail is transferred is generally desirable to save + time and network bandwidth, but this advantage is lost if + RCPT verification is lengthy. + + For example, the receiver can verify immediately any + simple local reference, such as a single locally- + registered mailbox. On the other hand, the "reasonable + time" limitation generally implies deferring verification + of a mailing list until after the message has been + transferred and accepted, since verifying a large mailing + list can take a very long time. An implementation might + or might not choose to defer validation of addresses that + are non-local and therefore require a DNS lookup. If a + DNS lookup is performed but a soft domain system error + (e.g., timeout) occurs, validity must be assumed. + + 5.2.8 DATA Command: RFC-821 Section 4.1.1 + + Every receiver-SMTP (not just one that "accepts a message for + relaying or for final delivery" [SMTP:1]) MUST insert a + "Received:" line at the beginning of a message. 
 In this line, + called a "time stamp line" in RFC-821: + + * The FROM field SHOULD contain both (1) the name of the + source host as presented in the HELO command and (2) a + domain literal containing the IP address of the source, + determined from the TCP connection. + + * The ID field MAY contain an "@" as suggested in RFC-822, + but this is not required. + + * The FOR field MAY contain a list of <path> entries when + multiple RCPT commands have been given. + + + An Internet mail program MUST NOT change a Received: line that + was previously added to the message header. + + + + + +Internet Engineering Task Force [Page 53] + + + + +RFC1123 MAIL -- SMTP & RFC-822 October 1989 + + + DISCUSSION: + Including both the source host and the IP source address + in the Received: line may provide enough information for + tracking illicit mail sources and eliminate a need to + explicitly verify the HELO parameter. + + Received: lines are primarily intended for humans tracing + mail routes, primarily for diagnosis of faults. See also + the discussion under 5.3.7. + + When the receiver-SMTP makes "final delivery" of a message, + then it MUST pass the MAIL FROM: address from the SMTP envelope + with the message, for use if an error notification message must + be sent later (see Section 5.3.3). There is an analogous + requirement when gatewaying from the Internet into a different + mail environment; see Section 5.3.7. + + DISCUSSION: + Note that the final reply to the DATA command depends only + upon the successful transfer and storage of the message. + Any problem with the destination address(es) must either + (1) have been reported in an SMTP error reply to the RCPT + command(s), or (2) be reported in a later error message + mailed to the originator. + + IMPLEMENTATION: + The MAIL FROM: information may be passed as a parameter or + in a Return-Path: line inserted at the beginning of the + message.
+ + 5.2.9 Command Syntax: RFC-821 Section 4.1.2 + + The syntax shown in RFC-821 for the MAIL FROM: command omits + the case of an empty path: "MAIL FROM: <>" (see RFC-821 Page + 15). An empty reverse path MUST be supported. + + 5.2.10 SMTP Replies: RFC-821 Section 4.2 + + A receiver-SMTP SHOULD send only the reply codes listed in + section 4.2.2 of RFC-821 or in this document. A receiver-SMTP + SHOULD use the text shown in examples in RFC-821 whenever + appropriate. + + A sender-SMTP MUST determine its actions only by the reply + code, not by the text (except for 251 and 551 replies); any + text, including no text at all, must be acceptable. The space + (blank) following the reply code is considered part of the + text. Whenever possible, a sender-SMTP SHOULD test only the + + + +Internet Engineering Task Force [Page 54] + + + + +RFC1123 MAIL -- SMTP & RFC-822 October 1989 + + + first digit of the reply code, as specified in Appendix E of + RFC-821. + + DISCUSSION: + Interoperability problems have arisen with SMTP systems + using reply codes that are not listed explicitly in RFC- + 821 Section 4.3 but are legal according to the theory of + reply codes explained in Appendix E. + + 5.2.11 Transparency: RFC-821 Section 4.5.2 + + Implementors MUST be sure that their mail systems always add + and delete periods to ensure message transparency. + + 5.2.12 WKS Use in MX Processing: RFC-974, p. 5 + + RFC-974 [SMTP:3] recommended that the domain system be queried + for WKS ("Well-Known Service") records, to verify that each + proposed mail target does support SMTP. Later experience has + shown that WKS is not widely supported, so the WKS step in MX + processing SHOULD NOT be used. + + The following are notes on RFC-822, organized by section of that + document. 
+ + 5.2.13 RFC-822 Message Specification: RFC-822 Section 4 + + The syntax shown for the Return-path line omits the possibility + of a null return path, which is used to prevent looping of + error notifications (see Section 5.3.3). The complete syntax + is: + + return = "Return-path" ":" route-addr + / "Return-path" ":" "<" ">" + + The set of optional header fields is hereby expanded to include + the Content-Type field defined in RFC-1049 [SMTP:7]. This + field "allows mail reading systems to automatically identify + the type of a structured message body and to process it for + display accordingly". [SMTP:7] A User Agent MAY support this + field. + + 5.2.14 RFC-822 Date and Time Specification: RFC-822 Section 5 + + The syntax for the date is hereby changed to: + + date = 1*2DIGIT month 2*4DIGIT + + + + +Internet Engineering Task Force [Page 55] + + + + +RFC1123 MAIL -- SMTP & RFC-822 October 1989 + + + All mail software SHOULD use 4-digit years in dates, to ease + the transition to the next century. + + There is a strong trend towards the use of numeric timezone + indicators, and implementations SHOULD use numeric timezones + instead of timezone names. However, all implementations MUST + accept either notation. If timezone names are used, they MUST + be exactly as defined in RFC-822. + + The military time zones are specified incorrectly in RFC-822: + they count the wrong way from UT (the signs are reversed). As + a result, military time zones in RFC-822 headers carry no + information. + + Finally, note that there is a typo in the definition of "zone" + in the syntax summary of appendix D; the correct definition + occurs in Section 3 of RFC-822. + + 5.2.15 RFC-822 Syntax Change: RFC-822 Section 6.1 + + The syntactic definition of "mailbox" in RFC-822 is hereby + changed to: + + mailbox = addr-spec ; simple address + / [phrase] route-addr ; name & addr-spec + + That is, the phrase preceding a route address is now OPTIONAL. 
+ This change makes the following header field legal, for + example: + + From: <craig@nnsc.nsf.net> + + 5.2.16 RFC-822 Local-part: RFC-822 Section 6.2 + + The basic mailbox address specification has the form: "local- + part@domain". Here "local-part", sometimes called the "left- + hand side" of the address, is domain-dependent. + + A host that is forwarding the message but is not the + destination host implied by the right-hand side "domain" MUST + NOT interpret or modify the "local-part" of the address. + + When mail is to be gatewayed from the Internet mail environment + into a foreign mail environment (see Section 5.3.7), routing + information for that foreign environment MAY be embedded within + the "local-part" of the address. The gateway will then + interpret this local part appropriately for the foreign mail + environment. + + + +Internet Engineering Task Force [Page 56] + + + + +RFC1123 MAIL -- SMTP & RFC-822 October 1989 + + + DISCUSSION: + Although source routes are discouraged within the Internet + (see Section 5.2.6), there are non-Internet mail + environments whose delivery mechanisms do depend upon + source routes. Source routes for extra-Internet + environments can generally be buried in the "local-part" + of the address (see Section 5.2.16) while mail traverses + the Internet. When the mail reaches the appropriate + Internet mail gateway, the gateway will interpret the + local-part and build the necessary address or route for + the target mail environment. + + For example, an Internet host might send mail to: + "a!b!c!user@gateway-domain". The complex local part + "a!b!c!user" would be uninterpreted within the Internet + domain, but could be parsed and understood by the + specified mail gateway. + + An embedded source route is sometimes encoded in the + "local-part" using "%" as a right-binding routing + operator.
For example, in: + + user%domain%relay3%relay2@relay1 + + the "%" convention implies that the mail is to be routed + from "relay1" through "relay2", "relay3", and finally to + "user" at "domain". This is commonly known as the "%- + hack". It is suggested that "%" have lower precedence + than any other routing operator (e.g., "!") hidden in the + local-part; for example, "a!b%c" would be interpreted as + "(a!b)%c". + + Only the target host (in this case, "relay1") is permitted + to analyze the local-part "user%domain%relay3%relay2". + + 5.2.17 Domain Literals: RFC-822 Section 6.2.3 + + A mailer MUST be able to accept and parse an Internet domain + literal whose content ("dtext"; see RFC-822) is a dotted- + decimal host address. This satisfies the requirement of + Section 2.1 for the case of mail. + + An SMTP MUST accept and recognize a domain literal for any of + its own IP addresses. + + + + + + + +Internet Engineering Task Force [Page 57] + + + + +RFC1123 MAIL -- SMTP & RFC-822 October 1989 + + + 5.2.18 Common Address Formatting Errors: RFC-822 Section 6.1 + + Errors in formatting or parsing 822 addresses are unfortunately + common. This section mentions only the most common errors. A + User Agent MUST accept all valid RFC-822 address formats, and + MUST NOT generate illegal address syntax. + + o A common error is to leave out the semicolon after a group + identifier. + + o Some systems fail to fully-qualify domain names in + messages they generate. The right-hand side of an "@" + sign in a header address field MUST be a fully-qualified + domain name. + + For example, some systems fail to fully-qualify the From: + address; this prevents a "reply" command in the user + interface from automatically constructing a return + address. + + DISCUSSION: + Although RFC-822 allows the local use of abbreviated + domain names within a domain, the application of + RFC-822 in Internet mail does not allow this. 
The + intent is that an Internet host must not send an SMTP + message header containing an abbreviated domain name + in an address field. This allows the address fields + of the header to be passed without alteration across + the Internet, as required in Section 5.2.6. + + o Some systems mis-parse multiple-hop explicit source routes + such as: + + @relay1,@relay2,@relay3:user@domain. + + + o Some systems over-qualify domain names by adding a + trailing dot to some or all domain names in addresses or + message-ids. This violates RFC-822 syntax. + + + 5.2.19 Explicit Source Routes: RFC-822 Section 6.2.7 + + Internet host software SHOULD NOT create an RFC-822 header + containing an address with an explicit source route, but MUST + accept such headers for compatibility with earlier systems. + + DISCUSSION: + + + +Internet Engineering Task Force [Page 58] + + + + +RFC1123 MAIL -- SMTP & RFC-822 October 1989 + + + In an understatement, RFC-822 says "The use of explicit + source routing is discouraged". Many hosts implemented + RFC-822 source routes incorrectly, so the syntax cannot be + used unambiguously in practice. Many users feel the + syntax is ugly. Explicit source routes are not needed in + the mail envelope for delivery; see Section 5.2.6. For + all these reasons, explicit source routes using the RFC- + 822 notations are not to be used in Internet mail headers. + + As stated in Section 5.2.16, it is necessary to allow an + explicit source route to be buried in the local-part of an + address, e.g., using the "%-hack", in order to allow mail + to be gatewayed into another environment in which explicit + source routing is necessary. The vigilant will observe + that there is no way for a User Agent to detect and + prevent the use of such implicit source routing when the + destination is within the Internet. We can only + discourage source routing of any kind within the Internet, + as unnecessary and undesirable. 
+ + 5.3 SPECIFIC ISSUES + + 5.3.1 SMTP Queueing Strategies + + The common structure of a host SMTP implementation includes + user mailboxes, one or more areas for queueing messages in + transit, and one or more daemon processes for sending and + receiving mail. The exact structure will vary depending on the + needs of the users on the host and the number and size of + mailing lists supported by the host. We describe several + optimizations that have proved helpful, particularly for + mailers supporting high traffic levels. + + Any queueing strategy MUST include: + + o Timeouts on all activities. See Section 5.3.2. + + o Never sending error messages in response to error + messages. + + + 5.3.1.1 Sending Strategy + + The general model of a sender-SMTP is one or more processes + that periodically attempt to transmit outgoing mail. In a + typical system, the program that composes a message has some + method for requesting immediate attention for a new piece of + outgoing mail, while mail that cannot be transmitted + + + +Internet Engineering Task Force [Page 59] + + + + +RFC1123 MAIL -- SMTP & RFC-822 October 1989 + + + immediately MUST be queued and periodically retried by the + sender. A mail queue entry will include not only the + message itself but also the envelope information. + + The sender MUST delay retrying a particular destination + after one attempt has failed. In general, the retry + interval SHOULD be at least 30 minutes; however, more + sophisticated and variable strategies will be beneficial + when the sender-SMTP can determine the reason for non- + delivery. + + Retries continue until the message is transmitted or the + sender gives up; the give-up time generally needs to be at + least 4-5 days. The parameters to the retry algorithm MUST + be configurable. + + A sender SHOULD keep a list of hosts it cannot reach and + corresponding timeouts, rather than just retrying queued + mail items. 
+ + DISCUSSION: + Experience suggests that failures are typically + transient (the target system has crashed), favoring a + policy of two connection attempts in the first hour the + message is in the queue, and then backing off to once + every two or three hours. + + The sender-SMTP can shorten the queueing delay by + cooperation with the receiver-SMTP. In particular, if + mail is received from a particular address, it is good + evidence that any mail queued for that host can now be + sent. + + The strategy may be further modified as a result of + multiple addresses per host (see Section 5.3.4), to + optimize delivery time vs. resource usage. + + A sender-SMTP may have a large queue of messages for + each unavailable destination host, and if it retried + all these messages in every retry cycle, there would be + excessive Internet overhead and the daemon would be + blocked for a long period. Note that an SMTP can + generally determine that a delivery attempt has failed + only after a timeout of a minute or more; a one minute + timeout per connection will result in a very large + delay if it is repeated for dozens or even hundreds of + queued messages. + + + + +Internet Engineering Task Force [Page 60] + + + + +RFC1123 MAIL -- SMTP & RFC-822 October 1989 + + + When the same message is to be delivered to several users on + the same host, only one copy of the message SHOULD be + transmitted. That is, the sender-SMTP should use the + command sequence: RCPT, RCPT,... RCPT, DATA instead of the + sequence: RCPT, DATA, RCPT, DATA,... RCPT, DATA. + Implementation of this efficiency feature is strongly urged. + + Similarly, the sender-SMTP MAY support multiple concurrent + outgoing mail transactions to achieve timely delivery. + However, some limit SHOULD be imposed to protect the host + from devoting all its resources to mail. + + The use of the different addresses of a multihomed host is + discussed below. 
+ + 5.3.1.2 Receiving strategy + + The receiver-SMTP SHOULD attempt to keep a pending listen on + the SMTP port at all times. This will require the support + of multiple incoming TCP connections for SMTP. Some limit + MAY be imposed. + + IMPLEMENTATION: + When the receiver-SMTP receives mail from a particular + host address, it could notify the sender-SMTP to retry + any mail pending for that host address. + + 5.3.2 Timeouts in SMTP + + There are two approaches to timeouts in the sender-SMTP: (a) + limit the time for each SMTP command separately, or (b) limit + the time for the entire SMTP dialogue for a single mail + message. A sender-SMTP SHOULD use option (a), per-command + timeouts. Timeouts SHOULD be easily reconfigurable, preferably + without recompiling the SMTP code. + + DISCUSSION: + Timeouts are an essential feature of an SMTP + implementation. If the timeouts are too long (or worse, + there are no timeouts), Internet communication failures or + software bugs in receiver-SMTP programs can tie up SMTP + processes indefinitely. If the timeouts are too short, + resources will be wasted with attempts that time out part + way through message delivery. + + If option (b) is used, the timeout has to be very large, + e.g., an hour, to allow time to expand very large mailing + lists. The timeout may also need to increase linearly + + + +Internet Engineering Task Force [Page 61] + + + + +RFC1123 MAIL -- SMTP & RFC-822 October 1989 + + + with the size of the message, to account for the time to + transmit a very large message. A large fixed timeout + leads to two problems: a failure can still tie up the + sender for a very long time, and very large messages may + still spuriously time out (which is a wasteful failure!). + + Using the recommended option (a), a timer is set for each + SMTP command and for each buffer of the data transfer. + The latter means that the overall timeout is inherently + proportional to the size of the message. 
+ + Based on extensive experience with busy mail-relay hosts, the + minimum per-command timeout values SHOULD be as follows: + + o Initial 220 Message: 5 minutes + + A Sender-SMTP process needs to distinguish between a + failed TCP connection and a delay in receiving the initial + 220 greeting message. Many receiver-SMTPs will accept a + TCP connection but delay delivery of the 220 message until + their system load will permit more mail to be processed. + + o MAIL Command: 5 minutes + + + o RCPT Command: 5 minutes + + A longer timeout would be required if processing of + mailing lists and aliases were not deferred until after + the message was accepted. + + o DATA Initiation: 2 minutes + + This is while awaiting the "354 Start Input" reply to a + DATA command. + + o Data Block: 3 minutes + + This is while awaiting the completion of each TCP SEND + call transmitting a chunk of data. + + o DATA Termination: 10 minutes. + + This is while awaiting the "250 OK" reply. When the + receiver gets the final period terminating the message + data, it typically performs processing to deliver the + message to a user mailbox. A spurious timeout at this + point would be very wasteful, since the message has been + + + +Internet Engineering Task Force [Page 62] + + + + +RFC1123 MAIL -- SMTP & RFC-822 October 1989 + + + successfully sent. + + A receiver-SMTP SHOULD have a timeout of at least 5 minutes + while it is awaiting the next command from the sender. + + 5.3.3 Reliable Mail Receipt + + When the receiver-SMTP accepts a piece of mail (by sending a + "250 OK" message in response to DATA), it is accepting + responsibility for delivering or relaying the message. It must + take this responsibility seriously, i.e., it MUST NOT lose the + message for frivolous reasons, e.g., because the host later + crashes or because of a predictable resource shortage. 
+ + If there is a delivery failure after acceptance of a message, + the receiver-SMTP MUST formulate and mail a notification + message. This notification MUST be sent using a null ("<>") + reverse path in the envelope; see Section 3.6 of RFC-821. The + recipient of this notification SHOULD be the address from the + envelope return path (or the Return-Path: line). However, if + this address is null ("<>"), the receiver-SMTP MUST NOT send a + notification. If the address is an explicit source route, it + SHOULD be stripped down to its final hop. + + DISCUSSION: + For example, suppose that an error notification must be + sent for a message that arrived with: + "MAIL FROM:<@a,@b:user@d>". The notification message + should be sent to: "RCPT TO:<user@d>". + + Some delivery failures after the message is accepted by + SMTP will be unavoidable. For example, it may be + impossible for the receiver-SMTP to validate all the + delivery addresses in RCPT command(s) due to a "soft" + domain system error or because the target is a mailing + list (see earlier discussion of RCPT). + + To avoid receiving duplicate messages as the result of + timeouts, a receiver-SMTP MUST seek to minimize the time + required to respond to the final "." that ends a message + transfer. See RFC-1047 [SMTP:4] for a discussion of this + problem. + + 5.3.4 Reliable Mail Transmission + + To transmit a message, a sender-SMTP determines the IP address + of the target host from the destination address in the + envelope. Specifically, it maps the string to the right of the + + + +Internet Engineering Task Force [Page 63] + + + + +RFC1123 MAIL -- SMTP & RFC-822 October 1989 + + + "@" sign into an IP address. This mapping or the transfer + itself may fail with a soft error, in which case the sender- + SMTP will requeue the outgoing mail for a later retry, as + required in Section 5.3.1.1.
+ + When it succeeds, the mapping can result in a list of + alternative delivery addresses rather than a single address, + because of (a) multiple MX records, (b) multihoming, or both. + To provide reliable mail transmission, the sender-SMTP MUST be + able to try (and retry) each of the addresses in this list in + order, until a delivery attempt succeeds. However, there MAY + also be a configurable limit on the number of alternate + addresses that can be tried. In any case, a host SHOULD try at + least two addresses. + + The following information is to be used to rank the host + addresses: + + (1) Multiple MX Records -- these contain a preference + indication that should be used in sorting. If there are + multiple destinations with the same preference and there + is no clear reason to favor one (e.g., by address + preference), then the sender-SMTP SHOULD pick one at + random to spread the load across multiple mail exchanges + for a specific organization; note that this is a + refinement of the procedure in [DNS:3]. + + (2) Multihomed host -- The destination host (perhaps taken + from the preferred MX record) may be multihomed, in which + case the domain name resolver will return a list of + alternative IP addresses. It is the responsibility of the + domain name resolver interface (see Section 6.1.3.4 below) + to have ordered this list by decreasing preference, and + SMTP MUST try them in the order presented. + + DISCUSSION: + Although the capability to try multiple alternative + addresses is required, there may be circumstances where + specific installations want to limit or disable the use of + alternative addresses. The question of whether a sender + should attempt retries using the different addresses of a + multihomed host has been controversial. 
The main argument + for using the multiple addresses is that it maximizes the + probability of timely delivery, and indeed sometimes the + probability of any delivery; the counter argument is that + it may result in unnecessary resource use. + + Note that resource use is also strongly determined by the + + + +Internet Engineering Task Force [Page 64] + + + + +RFC1123 MAIL -- SMTP & RFC-822 October 1989 + + + sending strategy discussed in Section 5.3.1. + + 5.3.5 Domain Name Support + + SMTP implementations MUST use the mechanism defined in Section + 6.1 for mapping between domain names and IP addresses. This + means that every Internet SMTP MUST include support for the + Internet DNS. + + In particular, a sender-SMTP MUST support the MX record scheme + [SMTP:3]. See also Section 7.4 of [DNS:2] for information on + domain name support for SMTP. + + 5.3.6 Mailing Lists and Aliases + + An SMTP-capable host SHOULD support both the alias and the list + form of address expansion for multiple delivery. When a + message is delivered or forwarded to each address of an + expanded list form, the return address in the envelope + ("MAIL FROM:") MUST be changed to be the address of a person + who administers the list, but the message header MUST be left + unchanged; in particular, the "From" field of the message is + unaffected. + + DISCUSSION: + An important mail facility is a mechanism for multi- + destination delivery of a single message, by transforming + or "expanding" a pseudo-mailbox address into a list of + destination mailbox addresses. When a message is sent to + such a pseudo-mailbox (sometimes called an "exploder"), + copies are forwarded or redistributed to each mailbox in + the expanded list. 
We classify such a pseudo-mailbox as + an "alias" or a "list", depending upon the expansion + rules: + + (a) Alias + + To expand an alias, the recipient mailer simply + replaces the pseudo-mailbox address in the envelope + with each of the expanded addresses in turn; the rest + of the envelope and the message body are left + unchanged. The message is then delivered or + forwarded to each expanded address. + + (b) List + + A mailing list may be said to operate by + "redistribution" rather than by "forwarding". To + + + +Internet Engineering Task Force [Page 65] + + + + +RFC1123 MAIL -- SMTP & RFC-822 October 1989 + + + expand a list, the recipient mailer replaces the + pseudo-mailbox address in the envelope with each of + the expanded addresses in turn. The return address in + the envelope is changed so that all error messages + generated by the final deliveries will be returned to + a list administrator, not to the message originator, + who generally has no control over the contents of the + list and will typically find error messages annoying. + + + 5.3.7 Mail Gatewaying + + Gatewaying mail between different mail environments, i.e., + different mail formats and protocols, is complex and does not + easily yield to standardization. See for example [SMTP:5a], + [SMTP:5b]. However, some general requirements may be given for + a gateway between the Internet and another mail environment. + + (A) Header fields MAY be rewritten when necessary as messages + are gatewayed across mail environment boundaries. + + DISCUSSION: + This may involve interpreting the local-part of the + destination address, as suggested in Section 5.2.16. + + The other mail systems gatewayed to the Internet + generally use a subset of RFC-822 headers, but some + of them do not have an equivalent to the SMTP + envelope. Therefore, when a message leaves the + Internet environment, it may be necessary to fold the + SMTP envelope information into the message header. 
A + possible solution would be to create new header + fields to carry the envelope information (e.g., "X- + SMTP-MAIL:" and "X-SMTP-RCPT:"); however, this would + require changes in mail programs in the foreign + environment. + + (B) When forwarding a message into or out of the Internet + environment, a gateway MUST prepend a Received: line, but + it MUST NOT alter in any way a Received: line that is + already in the header. + + DISCUSSION: + This requirement is a subset of the general + "Received:" line requirement of Section 5.2.8; it is + restated here for emphasis. + + Received: fields of messages originating from other + + + +Internet Engineering Task Force [Page 66] + + + + +RFC1123 MAIL -- SMTP & RFC-822 October 1989 + + + environments may not conform exactly to RFC822. + However, the most important use of Received: lines is + for debugging mail faults, and this debugging can be + severely hampered by well-meaning gateways that try + to "fix" a Received: line. + + The gateway is strongly encouraged to indicate the + environment and protocol in the "via" clauses of + Received field(s) that it supplies. + + (C) From the Internet side, the gateway SHOULD accept all + valid address formats in SMTP commands and in RFC-822 + headers, and all valid RFC-822 messages. Although a + gateway must accept an RFC-822 explicit source route + ("@...:" format) in either the RFC-822 header or in the + envelope, it MAY or may not act on the source route; see + Sections 5.2.6 and 5.2.19. + + DISCUSSION: + It is often tempting to restrict the range of + addresses accepted at the mail gateway to simplify + the translation into addresses for the remote + environment. This practice is based on the + assumption that mail users have control over the + addresses their mailers send to the mail gateway. In + practice, however, users have little control over the + addresses that are finally sent; their mailers are + free to change addresses into any legal RFC-822 + format. 
+ + (D) The gateway MUST ensure that all header fields of a + message that it forwards into the Internet meet the + requirements for Internet mail. In particular, all + addresses in "From:", "To:", "Cc:", etc., fields must be + transformed (if necessary) to satisfy RFC-822 syntax, and + they must be effective and useful for sending replies. + + + (E) The translation algorithm used to convert mail from the + Internet protocols to another environment's protocol + SHOULD try to ensure that error messages from the foreign + mail environment are delivered to the return path from the + SMTP envelope, not to the sender listed in the "From:" + field of the RFC-822 message. + + DISCUSSION: + Internet mail lists usually place the address of the + mail list maintainer in the envelope but leave the + + + +Internet Engineering Task Force [Page 67] + + + + +RFC1123 MAIL -- SMTP & RFC-822 October 1989 + + + original message header intact (with the "From:" + field containing the original sender). This yields + the behavior the average recipient expects: a reply + to the header gets sent to the original sender, not + to a mail list maintainer; however, errors get sent + to the maintainer (who can fix the problem) and not + the sender (who probably cannot). + + (F) Similarly, when forwarding a message from another + environment into the Internet, the gateway SHOULD set the + envelope return path in accordance with an error message + return address, if any, supplied by the foreign + environment. + + + 5.3.8 Maximum Message Size + + Mailer software MUST be able to send and receive messages of at + least 64K bytes in length (including header), and a much larger + maximum size is highly desirable. + + DISCUSSION: + Although SMTP does not define the maximum size of a + message, many systems impose implementation limits. + + The current de facto minimum limit in the Internet is 64K + bytes. However, electronic mail is used for a variety of + purposes that create much larger messages. 
For example, + mail is often used instead of FTP for transmitting ASCII + files, and in particular to transmit entire documents. As + a result, messages can be 1 megabyte or even larger. We + note that the present document together with its lower- + layer companion contains 0.5 megabytes. + + + + + + + + + + + + + + + + + + +Internet Engineering Task Force [Page 68] + + + + +RFC1123 MAIL -- SMTP & RFC-822 October 1989 + + + 5.4 SMTP REQUIREMENTS SUMMARY + + | | | | |S| | + | | | | |H| |F + | | | | |O|M|o + | | |S| |U|U|o + | | |H| |L|S|t + | |M|O| |D|T|n + | |U|U|M| | |o + | |S|L|A|N|N|t + | |T|D|Y|O|O|t +FEATURE |SECTION | | | |T|T|e +-----------------------------------------------|----------|-|-|-|-|-|-- + | | | | | | | +RECEIVER-SMTP: | | | | | | | + Implement VRFY |5.2.3 |x| | | | | + Implement EXPN |5.2.3 | |x| | | | + EXPN, VRFY configurable |5.2.3 | | |x| | | + Implement SEND, SOML, SAML |5.2.4 | | |x| | | + Verify HELO parameter |5.2.5 | | |x| | | + Refuse message with bad HELO |5.2.5 | | | | |x| + Accept explicit src-route syntax in env. |5.2.6 |x| | | | | + Support "postmaster" |5.2.7 |x| | | | | + Process RCPT when received (except lists) |5.2.7 | | |x| | | + Long delay of RCPT responses |5.2.7 | | | | |x| + | | | | | | | + Add Received: line |5.2.8 |x| | | | | + Received: line include domain literal |5.2.8 | |x| | | | + Change previous Received: line |5.2.8 | | | | |x| + Pass Return-Path info (final deliv/gwy) |5.2.8 |x| | | | | + Support empty reverse path |5.2.9 |x| | | | | + Send only official reply codes |5.2.10 | |x| | | | + Send text from RFC-821 when appropriate |5.2.10 | |x| | | | + Delete "." 
for transparency |5.2.11 |x| | | | | + Accept and recognize self domain literal(s) |5.2.17 |x| | | | | + | | | | | | | + Error message about error message |5.3.1 | | | | |x| + Keep pending listen on SMTP port |5.3.1.2 | |x| | | | + Provide limit on recv concurrency |5.3.1.2 | | |x| | | + Wait at least 5 mins for next sender cmd |5.3.2 | |x| | | | + Avoidable delivery failure after "250 OK" |5.3.3 | | | | |x| + Send error notification msg after accept |5.3.3 |x| | | | | + Send using null return path |5.3.3 |x| | | | | + Send to envelope return path |5.3.3 | |x| | | | + Send to null address |5.3.3 | | | | |x| + Strip off explicit src route |5.3.3 | |x| | | | + Minimize acceptance delay (RFC-1047) |5.3.3 |x| | | | | +-----------------------------------------------|----------|-|-|-|-|-|-- + + + +Internet Engineering Task Force [Page 69] + + + + +RFC1123 MAIL -- SMTP & RFC-822 October 1989 + + + | | | | | | | +SENDER-SMTP: | | | | | | | + Canonicalized domain names in MAIL, RCPT |5.2.2 |x| | | | | + Implement SEND, SOML, SAML |5.2.4 | | |x| | | + Send valid principal host name in HELO |5.2.5 |x| | | | | + Send explicit source route in RCPT TO: |5.2.6 | | | |x| | + Use only reply code to determine action |5.2.10 |x| | | | | + Use only high digit of reply code when poss. |5.2.10 | |x| | | | + Add "." 
for transparency |5.2.11 |x| | | | | + | | | | | | | + Retry messages after soft failure |5.3.1.1 |x| | | | | + Delay before retry |5.3.1.1 |x| | | | | + Configurable retry parameters |5.3.1.1 |x| | | | | + Retry once per each queued dest host |5.3.1.1 | |x| | | | + Multiple RCPT's for same DATA |5.3.1.1 | |x| | | | + Support multiple concurrent transactions |5.3.1.1 | | |x| | | + Provide limit on concurrency |5.3.1.1 | |x| | | | + | | | | | | | + Timeouts on all activities |5.3.1 |x| | | | | + Per-command timeouts |5.3.2 | |x| | | | + Timeouts easily reconfigurable |5.3.2 | |x| | | | + Recommended times |5.3.2 | |x| | | | + Try alternate addr's in order |5.3.4 |x| | | | | + Configurable limit on alternate tries |5.3.4 | | |x| | | + Try at least two alternates |5.3.4 | |x| | | | + Load-split across equal MX alternates |5.3.4 | |x| | | | + Use the Domain Name System |5.3.5 |x| | | | | + Support MX records |5.3.5 |x| | | | | + Use WKS records in MX processing |5.2.12 | | | |x| | +-----------------------------------------------|----------|-|-|-|-|-|-- + | | | | | | | +MAIL FORWARDING: | | | | | | | + Alter existing header field(s) |5.2.6 | | | |x| | + Implement relay function: 821/section 3.6 |5.2.6 | | |x| | | + If not, deliver to RHS domain |5.2.6 | |x| | | | + Interpret 'local-part' of addr |5.2.16 | | | | |x| + | | | | | | | +MAILING LISTS AND ALIASES | | | | | | | + Support both |5.3.6 | |x| | | | + Report mail list error to local admin. 
|5.3.6 |x| | | | | + | | | | | | | +MAIL GATEWAYS: | | | | | | | + Embed foreign mail route in local-part |5.2.16 | | |x| | | + Rewrite header fields when necessary |5.3.7 | | |x| | | + Prepend Received: line |5.3.7 |x| | | | | + Change existing Received: line |5.3.7 | | | | |x| + Accept full RFC-822 on Internet side |5.3.7 | |x| | | | + Act on RFC-822 explicit source route |5.3.7 | | |x| | | + + + +Internet Engineering Task Force [Page 70] + + + + +RFC1123 MAIL -- SMTP & RFC-822 October 1989 + + + Send only valid RFC-822 on Internet side |5.3.7 |x| | | | | + Deliver error msgs to envelope addr |5.3.7 | |x| | | | + Set env return path from err return addr |5.3.7 | |x| | | | + | | | | | | | +USER AGENT -- RFC-822 | | | | | | | + Allow user to enter address |5.2.6 | | | |x| | + Support RFC-1049 Content Type field |5.2.13 | | |x| | | + Use 4-digit years |5.2.14 | |x| | | | + Generate numeric timezones |5.2.14 | |x| | | | + Accept all timezones |5.2.14 |x| | | | | + Use non-num timezones from RFC-822 |5.2.14 |x| | | | | + Omit phrase before route-addr |5.2.15 | | |x| | | + Accept and parse dot.dec. domain literals |5.2.17 |x| | | | | + Accept all RFC-822 address formats |5.2.18 |x| | | | | + Generate invalid RFC-822 address format |5.2.18 | | | | |x| + Fully-qualified domain names in header |5.2.18 |x| | | | | + Create explicit src route in header |5.2.19 | | | |x| | + Accept explicit src route in header |5.2.19 |x| | | | | + | | | | | | | +Send/recv at least 64KB messages |5.3.8 |x| | | | | + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Internet Engineering Task Force [Page 71] + + + + +RFC1123 SUPPORT SERVICES -- DOMAINS October 1989 + + +6. SUPPORT SERVICES + + 6.1 DOMAIN NAME TRANSLATION + + 6.1.1 INTRODUCTION + + Every host MUST implement a resolver for the Domain Name System + (DNS), and it MUST implement a mechanism using this DNS + resolver to convert host names to IP addresses and vice-versa + [DNS:1, DNS:2]. 
+ + In addition to the DNS, a host MAY also implement a host name + translation mechanism that searches a local Internet host + table. See Section 6.1.3.8 for more information on this + option. + + DISCUSSION: + Internet host name translation was originally performed by + searching local copies of a table of all hosts. This + table became too large to update and distribute in a + timely manner and too large to fit into many hosts, so the + DNS was invented. + + The DNS creates a distributed database used primarily for + the translation between host names and host addresses. + Implementation of DNS software is required. The DNS + consists of two logically distinct parts: name servers and + resolvers (although implementations often combine these + two logical parts in the interest of efficiency) [DNS:2]. + + Domain name servers store authoritative data about certain + sections of the database and answer queries about the + data. Domain resolvers query domain name servers for data + on behalf of user processes. Every host therefore needs a + DNS resolver; some host machines will also need to run + domain name servers. Since no name server has complete + information, in general it is necessary to obtain + information from more than one name server to resolve a + query. + + 6.1.2 PROTOCOL WALK-THROUGH + + An implementor must study references [DNS:1] and [DNS:2] + carefully. They provide a thorough description of the theory, + protocol, and implementation of the domain name system, and + reflect several years of experience. + + + + + +Internet Engineering Task Force [Page 72] + + + + +RFC1123 SUPPORT SERVICES -- DOMAINS October 1989 + + + 6.1.2.1 Resource Records with Zero TTL: RFC-1035 Section 3.2.1 + + All DNS name servers and resolvers MUST properly handle RRs + with a zero TTL: return the RR to the client but do not + cache it. 
+ + DISCUSSION: + Zero TTL values are interpreted to mean that the RR can + only be used for the transaction in progress, and + should not be cached; they are useful for extremely + volatile data. + + 6.1.2.2 QCLASS Values: RFC-1035 Section 3.2.5 + + A query with "QCLASS=*" SHOULD NOT be used unless the + requestor is seeking data from more than one class. In + particular, if the requestor is only interested in Internet + data types, QCLASS=IN MUST be used. + + 6.1.2.3 Unused Fields: RFC-1035 Section 4.1.1 + + Unused fields in a query or response message MUST be zero. + + 6.1.2.4 Compression: RFC-1035 Section 4.1.4 + + Name servers MUST use compression in responses. + + DISCUSSION: + Compression is essential to avoid overflowing UDP + datagrams; see Section 6.1.3.2. + + 6.1.2.5 Misusing Configuration Info: RFC-1035 Section 6.1.2 + + Recursive name servers and full-service resolvers generally + have some configuration information containing hints about + the location of root or local name servers. An + implementation MUST NOT include any of these hints in a + response. + + DISCUSSION: + Many implementors have found it convenient to store + these hints as if they were cached data, but some + neglected to ensure that this "cached data" was not + included in responses. This has caused serious + problems in the Internet when the hints were obsolete + or incorrect. + + + + + +Internet Engineering Task Force [Page 73] + + + + +RFC1123 SUPPORT SERVICES -- DOMAINS October 1989 + + + 6.1.3 SPECIFIC ISSUES + + 6.1.3.1 Resolver Implementation + + A name resolver SHOULD be able to multiplex concurrent + requests if the host supports concurrent processes. + + In implementing a DNS resolver, one of two different models + MAY optionally be chosen: a full-service resolver, or a stub + resolver. 
+ + + (A) Full-Service Resolver + + A full-service resolver is a complete implementation of + the resolver service, and is capable of dealing with + communication failures, failure of individual name + servers, location of the proper name server for a given + name, etc. It must satisfy the following requirements: + + o The resolver MUST implement a local caching + function to avoid repeated remote access for + identical requests, and MUST time out information + in the cache. + + o The resolver SHOULD be configurable with start-up + information pointing to multiple root name servers + and multiple name servers for the local domain. + This insures that the resolver will be able to + access the whole name space in normal cases, and + will be able to access local domain information + should the local network become disconnected from + the rest of the Internet. + + + (B) Stub Resolver + + A "stub resolver" relies on the services of a recursive + name server on the connected network or a "nearby" + network. This scheme allows the host to pass on the + burden of the resolver function to a name server on + another host. This model is often essential for less + capable hosts, such as PCs, and is also recommended + when the host is one of several workstations on a local + network, because it allows all of the workstations to + share the cache of the recursive name server and hence + reduce the number of domain requests exported by the + local network. + + + +Internet Engineering Task Force [Page 74] + + + + +RFC1123 SUPPORT SERVICES -- DOMAINS October 1989 + + + At a minimum, the stub resolver MUST be capable of + directing its requests to redundant recursive name + servers. Note that recursive name servers are allowed + to restrict the sources of requests that they will + honor, so the host administrator must verify that the + service will be provided. Stub resolvers MAY implement + caching if they choose, but if so, MUST timeout cached + information. 
+ + + 6.1.3.2 Transport Protocols + + DNS resolvers and recursive servers MUST support UDP, and + SHOULD support TCP, for sending (non-zone-transfer) queries. + Specifically, a DNS resolver or server that is sending a + non-zone-transfer query MUST send a UDP query first. If the + Answer section of the response is truncated and if the + requester supports TCP, it SHOULD try the query again using + TCP. + + DNS servers MUST be able to service UDP queries and SHOULD + be able to service TCP queries. A name server MAY limit the + resources it devotes to TCP queries, but it SHOULD NOT + refuse to service a TCP query just because it would have + succeeded with UDP. + + Truncated responses MUST NOT be saved (cached) and later + used in such a way that the fact that they are truncated is + lost. + + DISCUSSION: + UDP is preferred over TCP for queries because UDP + queries have much lower overhead, both in packet count + and in connection state. The use of UDP is essential + for heavily-loaded servers, especially the root + servers. UDP also offers additional robustness, since + a resolver can attempt several UDP queries to different + servers for the cost of a single TCP query. + + It is possible for a DNS response to be truncated, + although this is a very rare occurrence in the present + Internet DNS. Practically speaking, truncation cannot + be predicted, since it is data-dependent. The + dependencies include the number of RRs in the answer, + the size of each RR, and the savings in space realized + by the name compression algorithm. As a rule of thumb, + truncation in NS and MX lists should not occur for + answers containing 15 or fewer RRs. + + + +Internet Engineering Task Force [Page 75] + + + + +RFC1123 SUPPORT SERVICES -- DOMAINS October 1989 + + + Whether it is possible to use a truncated answer + depends on the application. A mailer must not use a + truncated MX response, since this could lead to mail + loops. 
+ + Responsible practices can make UDP suffice in the vast + majority of cases. Name servers must use compression + in responses. Resolvers must differentiate truncation + of the Additional section of a response (which only + loses extra information) from truncation of the Answer + section (which for MX records renders the response + unusable by mailers). Database administrators should + list only a reasonable number of primary names in lists + of name servers, MX alternatives, etc. + + However, it is also clear that some new DNS record + types defined in the future will contain information + exceeding the 512 byte limit that applies to UDP, and + hence will require TCP. Thus, resolvers and name + servers should implement TCP services as a backup to + UDP today, with the knowledge that they will require + the TCP service in the future. + + By private agreement, name servers and resolvers MAY arrange + to use TCP for all traffic between themselves. TCP MUST be + used for zone transfers. + + A DNS server MUST have sufficient internal concurrency that + it can continue to process UDP queries while awaiting a + response or performing a zone transfer on an open TCP + connection [DNS:2]. + + A server MAY support a UDP query that is delivered using an + IP broadcast or multicast address. However, the Recursion + Desired bit MUST NOT be set in a query that is multicast, + and MUST be ignored by name servers receiving queries via a + broadcast or multicast address. A host that sends broadcast + or multicast DNS queries SHOULD send them only as occasional + probes, caching the IP address(es) it obtains from the + response(s) so it can normally send unicast queries. + + DISCUSSION: + Broadcast or (especially) IP multicast can provide a + way to locate nearby name servers without knowing their + IP addresses in advance. However, general broadcasting + of recursive queries can result in excessive and + unnecessary load on both network and servers. 
+ + + + +Internet Engineering Task Force [Page 76] + + + + +RFC1123 SUPPORT SERVICES -- DOMAINS October 1989 + + + 6.1.3.3 Efficient Resource Usage + + The following requirements on servers and resolvers are very + important to the health of the Internet as a whole, + particularly when DNS services are invoked repeatedly by + higher level automatic servers, such as mailers. + + (1) The resolver MUST implement retransmission controls to + insure that it does not waste communication bandwidth, + and MUST impose finite bounds on the resources consumed + to respond to a single request. See [DNS:2] pages 43- + 44 for specific recommendations. + + (2) After a query has been retransmitted several times + without a response, an implementation MUST give up and + return a soft error to the application. + + (3) All DNS name servers and resolvers SHOULD cache + temporary failures, with a timeout period of the order + of minutes. + + DISCUSSION: + This will prevent applications that immediately + retry soft failures (in violation of Section 2.2 + of this document) from generating excessive DNS + traffic. + + (4) All DNS name servers and resolvers SHOULD cache + negative responses that indicate the specified name, or + data of the specified type, does not exist, as + described in [DNS:2]. + + (5) When a DNS server or resolver retries a UDP query, the + retry interval SHOULD be constrained by an exponential + backoff algorithm, and SHOULD also have upper and lower + bounds. + + IMPLEMENTATION: + A measured RTT and variance (if available) should + be used to calculate an initial retransmission + interval. If this information is not available, a + default of no less than 5 seconds should be used. + Implementations may limit the retransmission + interval, but this limit must exceed twice the + Internet maximum segment lifetime plus service + delay at the name server. 
+ + (6) When a resolver or server receives a Source Quench for + + + +Internet Engineering Task Force [Page 77] + + + + +RFC1123 SUPPORT SERVICES -- DOMAINS October 1989 + + + a query it has issued, it SHOULD take steps to reduce + the rate of querying that server in the near future. A + server MAY ignore a Source Quench that it receives as + the result of sending a response datagram. + + IMPLEMENTATION: + One recommended action to reduce the rate is to + send the next query attempt to an alternate + server, if there is one available. Another is to + backoff the retry interval for the same server. + + + 6.1.3.4 Multihomed Hosts + + When the host name-to-address function encounters a host + with multiple addresses, it SHOULD rank or sort the + addresses using knowledge of the immediately connected + network number(s) and any other applicable performance or + history information. + + DISCUSSION: + The different addresses of a multihomed host generally + imply different Internet paths, and some paths may be + preferable to others in performance, reliability, or + administrative restrictions. There is no general way + for the domain system to determine the best path. A + recommended approach is to base this decision on local + configuration information set by the system + administrator. + + IMPLEMENTATION: + The following scheme has been used successfully: + + (a) Incorporate into the host configuration data a + Network-Preference List, that is simply a list of + networks in preferred order. This list may be + empty if there is no preference. + + (b) When a host name is mapped into a list of IP + addresses, these addresses should be sorted by + network number, into the same order as the + corresponding networks in the Network-Preference + List. IP addresses whose networks do not appear + in the Network-Preference List should be placed at + the end of the list. 
+ + + + + + +Internet Engineering Task Force [Page 78] + + + + +RFC1123 SUPPORT SERVICES -- DOMAINS October 1989 + + + 6.1.3.5 Extensibility + + DNS software MUST support all well-known, class-independent + formats [DNS:2], and SHOULD be written to minimize the + trauma associated with the introduction of new well-known + types and local experimentation with non-standard types. + + DISCUSSION: + The data types and classes used by the DNS are + extensible, and thus new types will be added and old + types deleted or redefined. Introduction of new data + types ought to be dependent only upon the rules for + compression of domain names inside DNS messages, and + the translation between printable (i.e., master file) + and internal formats for Resource Records (RRs). + + Compression relies on knowledge of the format of data + inside a particular RR. Hence compression must only be + used for the contents of well-known, class-independent + RRs, and must never be used for class-specific RRs or + RR types that are not well-known. The owner name of an + RR is always eligible for compression. + + A name server may acquire, via zone transfer, RRs that + the server doesn't know how to convert to printable + format. A resolver can receive similar information as + the result of queries. For proper operation, this data + must be preserved, and hence the implication is that + DNS software cannot use textual formats for internal + storage. + + The DNS defines domain name syntax very generally -- a + string of labels each containing up to 63 8-bit octets, + separated by dots, and with a maximum total of 255 + octets. Particular applications of the DNS are + permitted to further constrain the syntax of the domain + names they use, although the DNS deployment has led to + some applications allowing more general names. In + particular, Section 2.1 of this document liberalizes + slightly the syntax of a legal Internet host name that + was defined in RFC-952 [DNS:4]. 
+ + 6.1.3.6 Status of RR Types + + Name servers MUST be able to load all RR types except MD and + MF from configuration files. The MD and MF types are + obsolete and MUST NOT be implemented; in particular, name + servers MUST NOT load these types from configuration files. + + + +Internet Engineering Task Force [Page 79] + + + + +RFC1123 SUPPORT SERVICES -- DOMAINS October 1989 + + + DISCUSSION: + The RR types MB, MG, MR, NULL, MINFO and RP are + considered experimental, and applications that use the + DNS cannot expect these RR types to be supported by + most domains. Furthermore these types are subject to + redefinition. + + The TXT and WKS RR types have not been widely used by + Internet sites; as a result, an application cannot rely + on the existence of a TXT or WKS RR in most + domains. + + 6.1.3.7 Robustness + + DNS software may need to operate in environments where the + root servers or other servers are unavailable due to network + connectivity or other problems. In this situation, DNS name + servers and resolvers MUST continue to provide service for + the reachable part of the name space, while giving temporary + failures for the rest. + + DISCUSSION: + Although the DNS is meant to be used primarily in the + connected Internet, it should be possible to use the + system in networks which are unconnected to the + Internet. Hence implementations must not depend on + access to root servers before providing service for + local names. + + 6.1.3.8 Local Host Table + + DISCUSSION: + A host may use a local host table as a backup or + supplement to the DNS. This raises the question of + which takes precedence, the DNS or the host table; the + most flexible approach would make this a configuration + option. + + Typically, the contents of such a supplementary host + table will be determined locally by the site. 
However, + a publicly available table of Internet hosts is + maintained by the DDN Network Information Center (DDN + NIC), with a format documented in [DNS:4]. This table + can be retrieved from the DDN NIC using a protocol + described in [DNS:5]. It must be noted that this table + contains only a small fraction of all Internet hosts. + Hosts using this protocol to retrieve the DDN NIC host + table should use the VERSION command to check if the + + + +Internet Engineering Task Force [Page 80] + + + + +RFC1123 SUPPORT SERVICES -- DOMAINS October 1989 + + + table has changed before requesting the entire table + with the ALL command. The VERSION identifier should be + treated as an arbitrary string and tested only for + equality; no numerical sequence may be assumed. + + The DDN NIC host table includes administrative + information that is not needed for host operation and + is therefore not currently included in the DNS + database; examples include network and gateway entries. + However, much of this additional information will be + added to the DNS in the future. Conversely, the DNS + provides essential services (in particular, MX records) + that are not available from the DDN NIC host table. + + 6.1.4 DNS USER INTERFACE + + 6.1.4.1 DNS Administration + + This document is concerned with design and implementation + issues in host software, not with administrative or + operational issues. However, administrative issues are of + particular importance in the DNS, since errors in particular + segments of this large distributed database can cause poor + or erroneous performance for many sites. These issues are + discussed in [DNS:6] and [DNS:7]. + + 6.1.4.2 DNS User Interface + + Hosts MUST provide an interface to the DNS for all + application programs running on the host. This interface + will typically direct requests to a system process to + perform the resolver function [DNS:1, 6.1:2]. 
+ + At a minimum, the basic interface MUST support a request for + all information of a specific type and class associated with + a specific name, and it MUST return either all of the + requested information, a hard error code, or a soft error + indication. When there is no error, the basic interface + returns the complete response information without + modification, deletion, or ordering, so that the basic + interface will not need to be changed to accommodate new + data types. + + DISCUSSION: + The soft error indication is an essential part of the + interface, since it may not always be possible to + access particular information from the DNS; see Section + 6.1.3.3. + + + +Internet Engineering Task Force [Page 81] + + + + +RFC1123 SUPPORT SERVICES -- DOMAINS October 1989 + + + A host MAY provide other DNS interfaces tailored to + particular functions, transforming the raw domain data into + formats more suited to these functions. In particular, a + host MUST provide a DNS interface to facilitate translation + between host addresses and host names. + + 6.1.4.3 Interface Abbreviation Facilities + + User interfaces MAY provide a method for users to enter + abbreviations for commonly-used names. Although the + definition of such methods is outside of the scope of the + DNS specification, certain rules are necessary to insure + that these methods allow access to the entire DNS name space + and to prevent excessive use of Internet resources. + + If an abbreviation method is provided, then: + + (a) There MUST be some convention for denoting that a name + is already complete, so that the abbreviation method(s) + are suppressed. A trailing dot is the usual method. + + (b) Abbreviation expansion MUST be done exactly once, and + MUST be done in the context in which the name was + entered. 
+ + + DISCUSSION: + For example, if an abbreviation is used in a mail + program for a destination, the abbreviation should be + expanded into a full domain name and stored in the + queued message with an indication that it is already + complete. Otherwise, the abbreviation might be + expanded with a mail system search list, not the + user's, or a name could grow due to repeated + canonicalization attempts interacting with wildcards. + + The two most common abbreviation methods are: + + (1) Interface-level aliases + + Interface-level aliases are conceptually implemented as + a list of alias/domain name pairs. The list can be + per-user or per-host, and separate lists can be + associated with different functions, e.g. one list for + host name-to-address translation, and a different list + for mail domains. When the user enters a name, the + interface attempts to match the name to the alias + component of a list entry, and if a matching entry can + + + +Internet Engineering Task Force [Page 82] + + + + +RFC1123 SUPPORT SERVICES -- DOMAINS October 1989 + + + be found, the name is replaced by the domain name found + in the pair. + + Note that interface-level aliases and CNAMEs are + completely separate mechanisms; interface-level aliases + are a local matter while CNAMEs are an Internet-wide + aliasing mechanism which is a required part of any DNS + implementation. + + (2) Search Lists + + A search list is conceptually implemented as an ordered + list of domain names. When the user enters a name, the + domain names in the search list are used as suffixes to + the user-supplied name, one by one, until a domain name + with the desired associated data is found, or the + search list is exhausted. Search lists often contain + the name of the local host's parent domain or other + ancestor domains. Search lists are often per-user or + per-process. + + It SHOULD be possible for an administrator to disable a + DNS search-list facility. 
Administrative denial may be + warranted in some cases, to prevent abuse of the DNS. + + There is danger that a search-list mechanism will + generate excessive queries to the root servers while + testing whether user input is a complete domain name, + lacking a final period to mark it as complete. A + search-list mechanism MUST have one of, and SHOULD have + both of, the following two provisions to prevent this: + + (a) The local resolver/name server can implement + caching of negative responses (see Section + 6.1.3.3). + + (b) The search list expander can require two or more + interior dots in a generated domain name before it + tries using the name in a query to non-local + domain servers, such as the root. + + DISCUSSION: + The intent of this requirement is to avoid + excessive delay for the user as the search list is + tested, and more importantly to prevent excessive + traffic to the root and other high-level servers. + For example, if the user supplied a name "X" and + the search list contained the root as a component, + + + +Internet Engineering Task Force [Page 83] + + + + +RFC1123 SUPPORT SERVICES -- DOMAINS October 1989 + + + a query would have to consult a root server before + the next search list alternative could be tried. + The resulting load seen by the root servers and + gateways near the root would be multiplied by the + number of hosts in the Internet. + + The negative caching alternative limits the effect + to the first time a name is used. The interior + dot rule is simpler to implement but can prevent + easy use of some top-level names. 
+ + + 6.1.5 DOMAIN NAME SYSTEM REQUIREMENTS SUMMARY + + | | | | |S| | + | | | | |H| |F + | | | | |O|M|o + | | |S| |U|U|o + | | |H| |L|S|t + | |M|O| |D|T|n + | |U|U|M| | |o + | |S|L|A|N|N|t + | |T|D|Y|O|O|t +FEATURE |SECTION | | | |T|T|e +-----------------------------------------------|-----------|-|-|-|-|-|-- +GENERAL ISSUES | | | | | | | + | | | | | | | +Implement DNS name-to-address conversion |6.1.1 |x| | | | | +Implement DNS address-to-name conversion |6.1.1 |x| | | | | +Support conversions using host table |6.1.1 | | |x| | | +Properly handle RR with zero TTL |6.1.2.1 |x| | | | | +Use QCLASS=* unnecessarily |6.1.2.2 | | | |x| | + Use QCLASS=IN for Internet class |6.1.2.2 |x| | | | | +Unused fields zero |6.1.2.3 |x| | | | | +Use compression in responses |6.1.2.4 |x| | | | | + | | | | | | | +Include config info in responses |6.1.2.5 | | | | |x| +Support all well-known, class-indep. types |6.1.3.5 |x| | | | | +Easily expand type list |6.1.3.5 | |x| | | | +Load all RR types (except MD and MF) |6.1.3.6 |x| | | | | +Load MD or MF type |6.1.3.6 | | | | |x| +Operate when root servers, etc. 
unavailable |6.1.3.7 |x| | | | | +-----------------------------------------------|-----------|-|-|-|-|-|-- +RESOLVER ISSUES: | | | | | | | + | | | | | | | +Resolver support multiple concurrent requests |6.1.3.1 | |x| | | | +Full-service resolver: |6.1.3.1 | | |x| | | + Local caching |6.1.3.1 |x| | | | | + + + +Internet Engineering Task Force [Page 84] + + + + +RFC1123 SUPPORT SERVICES -- DOMAINS October 1989 + + + Information in local cache times out |6.1.3.1 |x| | | | | + Configurable with starting info |6.1.3.1 | |x| | | | +Stub resolver: |6.1.3.1 | | |x| | | + Use redundant recursive name servers |6.1.3.1 |x| | | | | + Local caching |6.1.3.1 | | |x| | | + Information in local cache times out |6.1.3.1 |x| | | | | +Support for remote multi-homed hosts: | | | | | | | + Sort multiple addresses by preference list |6.1.3.4 | |x| | | | + | | | | | | | +-----------------------------------------------|-----------|-|-|-|-|-|-- +TRANSPORT PROTOCOLS: | | | | | | | + | | | | | | | +Support UDP queries |6.1.3.2 |x| | | | | +Support TCP queries |6.1.3.2 | |x| | | | + Send query using UDP first |6.1.3.2 |x| | | | |1 + Try TCP if UDP answers are truncated |6.1.3.2 | |x| | | | +Name server limit TCP query resources |6.1.3.2 | | |x| | | + Punish unnecessary TCP query |6.1.3.2 | | | |x| | +Use truncated data as if it were not |6.1.3.2 | | | | |x| +Private agreement to use only TCP |6.1.3.2 | | |x| | | +Use TCP for zone transfers |6.1.3.2 |x| | | | | +TCP usage not block UDP queries |6.1.3.2 |x| | | | | +Support broadcast or multicast queries |6.1.3.2 | | |x| | | + RD bit set in query |6.1.3.2 | | | | |x| + RD bit ignored by server if b'cast/m'cast |6.1.3.2 |x| | | | | + Send only as occasional probe for addr's |6.1.3.2 | |x| | | | +-----------------------------------------------|-----------|-|-|-|-|-|-- +RESOURCE USAGE: | | | | | | | + | | | | | | | +Transmission controls, per [DNS:2] |6.1.3.3 |x| | | | | + Finite bounds per request |6.1.3.3 |x| | | | | +Failure after retries => 
soft error |6.1.3.3 |x| | | | | +Cache temporary failures |6.1.3.3 | |x| | | | +Cache negative responses |6.1.3.3 | |x| | | | +Retries use exponential backoff |6.1.3.3 | |x| | | | + Upper, lower bounds |6.1.3.3 | |x| | | | +Client handle Source Quench |6.1.3.3 | |x| | | | +Server ignore Source Quench |6.1.3.3 | | |x| | | +-----------------------------------------------|-----------|-|-|-|-|-|-- +USER INTERFACE: | | | | | | | + | | | | | | | +All programs have access to DNS interface |6.1.4.2 |x| | | | | +Able to request all info for given name |6.1.4.2 |x| | | | | +Returns complete info or error |6.1.4.2 |x| | | | | +Special interfaces |6.1.4.2 | | |x| | | + Name<->Address translation |6.1.4.2 |x| | | | | + | | | | | | | +Abbreviation Facilities: |6.1.4.3 | | |x| | | + + + +Internet Engineering Task Force [Page 85] + + + + +RFC1123 SUPPORT SERVICES -- DOMAINS October 1989 + + + Convention for complete names |6.1.4.3 |x| | | | | + Conversion exactly once |6.1.4.3 |x| | | | | + Conversion in proper context |6.1.4.3 |x| | | | | + Search list: |6.1.4.3 | | |x| | | + Administrator can disable |6.1.4.3 | |x| | | | + Prevention of excessive root queries |6.1.4.3 |x| | | | | + Both methods |6.1.4.3 | |x| | | | +-----------------------------------------------|-----------|-|-|-|-|-|-- +-----------------------------------------------|-----------|-|-|-|-|-|-- + +1. Unless there is private agreement between particular resolver and + particular server. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Internet Engineering Task Force [Page 86] + + + + +RFC1123 SUPPORT SERVICES -- INITIALIZATION October 1989 + + + 6.2 HOST INITIALIZATION + + 6.2.1 INTRODUCTION + + This section discusses the initialization of host software + across a connected network, or more generally across an + Internet path. This is necessary for a diskless host, and may + optionally be used for a host with disk drives. 
For a diskless + host, the initialization process is called "network booting" + and is controlled by a bootstrap program located in a boot ROM. + + To initialize a diskless host across the network, there are two + distinct phases: + + (1) Configure the IP layer. + + Diskless machines often have no permanent storage in which + to store network configuration information, so that + sufficient configuration information must be obtained + dynamically to support the loading phase that follows. + This information must include at least the IP addresses of + the host and of the boot server. To support booting + across a gateway, the address mask and a list of default + gateways are also required. + + (2) Load the host system code. + + During the loading phase, an appropriate file transfer + protocol is used to copy the system code across the + network from the boot server. + + A host with a disk may perform the first step, dynamic + configuration. This is important for microcomputers, whose + floppy disks allow network configuration information to be + mistakenly duplicated on more than one host. Also, + installation of new hosts is much simpler if they automatically + obtain their configuration information from a central server, + saving administrator time and decreasing the probability of + mistakes. + + 6.2.2 REQUIREMENTS + + 6.2.2.1 Dynamic Configuration + + A number of protocol provisions have been made for dynamic + configuration. + + o ICMP Information Request/Reply messages + + + +Internet Engineering Task Force [Page 87] + + + + +RFC1123 SUPPORT SERVICES -- INITIALIZATION October 1989 + + + This obsolete message pair was designed to allow a host + to find the number of the network it is on. + Unfortunately, it was useful only if the host already + knew the host number part of its IP address, + information that hosts requiring dynamic configuration + seldom had. 
+ + o Reverse Address Resolution Protocol (RARP) [BOOT:4] + + RARP is a link-layer protocol for a broadcast medium + that allows a host to find its IP address given its + link layer address. Unfortunately, RARP does not work + across IP gateways and therefore requires a RARP server + on every network. In addition, RARP does not provide + any other configuration information. + + o ICMP Address Mask Request/Reply messages + + These ICMP messages allow a host to learn the address + mask for a particular network interface. + + o BOOTP Protocol [BOOT:2] + + This protocol allows a host to determine the IP + addresses of the local host and the boot server, the + name of an appropriate boot file, and optionally the + address mask and list of default gateways. To locate a + BOOTP server, the host broadcasts a BOOTP request using + UDP. Ad hoc gateway extensions have been used to + transmit the BOOTP broadcast through gateways, and in + the future the IP Multicasting facility will provide a + standard mechanism for this purpose. + + + The suggested approach to dynamic configuration is to use + the BOOTP protocol with the extensions defined in "BOOTP + Vendor Information Extensions" RFC-1084 [BOOT:3]. RFC-1084 + defines some important general (not vendor-specific) + extensions. In particular, these extensions allow the + address mask to be supplied in BOOTP; we RECOMMEND that the + address mask be supplied in this manner. + + DISCUSSION: + Historically, subnetting was defined long after IP, and + so a separate mechanism (ICMP Address Mask messages) + was designed to supply the address mask to a host. + However, the IP address mask and the corresponding IP + address conceptually form a pair, and for operational + + + +Internet Engineering Task Force [Page 88] + + + + +RFC1123 SUPPORT SERVICES -- INITIALIZATION October 1989 + + + simplicity they ought to be defined at the same time + and by the same mechanism, whether a configuration file + or a dynamic mechanism like BOOTP. 
+ + Note that BOOTP is not sufficiently general to specify + the configurations of all interfaces of a multihomed + host. A multihomed host must either use BOOTP + separately for each interface, or configure one + interface using BOOTP to perform the loading, and + perform the complete initialization from a file later. + + Application layer configuration information is expected + to be obtained from files after loading of the system + code. + + 6.2.2.2 Loading Phase + + A suggested approach for the loading phase is to use TFTP + [BOOT:1] between the IP addresses established by BOOTP. + + TFTP to a broadcast address SHOULD NOT be used, for reasons + explained in Section 4.2.3.4. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Internet Engineering Task Force [Page 89] + + + + +RFC1123 SUPPORT SERVICES -- MANAGEMENT October 1989 + + + 6.3 REMOTE MANAGEMENT + + 6.3.1 INTRODUCTION + + The Internet community has recently put considerable effort + into the development of network management protocols. The + result has been a two-pronged approach [MGT:1, MGT:6]: the + Simple Network Management Protocol (SNMP) [MGT:4] and the + Common Management Information Protocol over TCP (CMOT) [MGT:5]. + + In order to be managed using SNMP or CMOT, a host will need to + implement an appropriate management agent. An Internet host + SHOULD include an agent for either SNMP or CMOT. + + Both SNMP and CMOT operate on a Management Information Base + (MIB) that defines a collection of management values. By + reading and setting these values, a remote application may + query and change the state of the managed system. + + A standard MIB [MGT:3] has been defined for use by both + management protocols, using data types defined by the Structure + of Management Information (SMI) defined in [MGT:2]. Additional + MIB variables can be introduced under the "enterprises" and + "experimental" subtrees of the MIB naming space [MGT:2]. 
+ + Every protocol module in the host SHOULD implement the relevant + MIB variables. A host SHOULD implement the MIB variables as + defined in the most recent standard MIB, and MAY implement + other MIB variables when appropriate and useful. + + 6.3.2 PROTOCOL WALK-THROUGH + + The MIB is intended to cover both hosts and gateways, although + there may be detailed differences in MIB application to the two + cases. This section contains the appropriate interpretation of + the MIB for hosts. It is likely that later versions of the MIB + will include more entries for host management. + + A managed host must implement the following groups of MIB + object definitions: System, Interfaces, Address Translation, + IP, ICMP, TCP, and UDP. + + The following specific interpretations apply to hosts: + + o ipInHdrErrors + + Note that the error "time-to-live exceeded" can occur in a + host only when it is forwarding a source-routed datagram. + + + +Internet Engineering Task Force [Page 90] + + + + +RFC1123 SUPPORT SERVICES -- MANAGEMENT October 1989 + + + o ipOutNoRoutes + + This object counts datagrams discarded because no route + can be found. This may happen in a host if all the + default gateways in the host's configuration are down. + + o ipFragOKs, ipFragFails, ipFragCreates + + A host that does not implement intentional fragmentation + (see "Fragmentation" section of [INTRO:1]) MUST return the + value zero for these three objects. + + o icmpOutRedirects + + For a host, this object MUST always be zero, since hosts + do not send Redirects. + + o icmpOutAddrMaskReps + + For a host, this object MUST always be zero, unless the + host is an authoritative source of address mask + information. + + o ipAddrTable + + For a host, the "IP Address Table" object is effectively a + table of logical interfaces. 
+ + o ipRoutingTable + + For a host, the "IP Routing Table" object is effectively a + combination of the host's Routing Cache and the static + route table described in "Routing Outbound Datagrams" + section of [INTRO:1]. + + Within each ipRouteEntry, ipRouteMetric1...4 normally will + have no meaning for a host and SHOULD always be -1, while + ipRouteType will normally have the value "remote". + + If destinations on the connected network do not appear in + the Route Cache (see "Routing Outbound Datagrams" section + of [INTRO:1]), there will be no entries with ipRouteType + of "direct". + + + DISCUSSION: + The current MIB does not include Type-of-Service in an + ipRouteEntry, but a future revision is expected to make + + + +Internet Engineering Task Force [Page 91] + + + + +RFC1123 SUPPORT SERVICES -- MANAGEMENT October 1989 + + + this addition. + + We also expect the MIB to be expanded to allow the remote + management of applications (e.g., the ability to partially + reconfigure mail systems). Network service applications + such as mail systems should therefore be written with the + "hooks" for remote management. + + 6.3.3 MANAGEMENT REQUIREMENTS SUMMARY + + | | | | |S| | + | | | | |H| |F + | | | | |O|M|o + | | |S| |U|U|o + | | |H| |L|S|t + | |M|O| |D|T|n + | |U|U|M| | |o + | |S|L|A|N|N|t + | |T|D|Y|O|O|t +FEATURE |SECTION | | | |T|T|e +-----------------------------------------------|-----------|-|-|-|-|-|-- +Support SNMP or CMOT agent |6.3.1 | |x| | | | +Implement specified objects in standard MIB |6.3.1 | |x| | | | + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Internet Engineering Task Force [Page 92] + + + + +RFC1123 SUPPORT SERVICES -- MANAGEMENT October 1989 + + +7. REFERENCES + + This section lists the primary references with which every + implementer must be thoroughly familiar. It also lists some + secondary references that are suggested additional reading.
+ + INTRODUCTORY REFERENCES: + + + [INTRO:1] "Requirements for Internet Hosts -- Communication Layers," + IETF Host Requirements Working Group, R. Braden, Ed., RFC-1122, + October 1989. + + [INTRO:2] "DDN Protocol Handbook," NIC-50004, NIC-50005, NIC-50006, + (three volumes), SRI International, December 1985. + + [INTRO:3] "Official Internet Protocols," J. Reynolds and J. Postel, + RFC-1011, May 1987. + + This document is republished periodically with new RFC numbers; + the latest version must be used. + + [INTRO:4] "Protocol Document Order Information," O. Jacobsen and J. + Postel, RFC-980, March 1986. + + [INTRO:5] "Assigned Numbers," J. Reynolds and J. Postel, RFC-1010, + May 1987. + + This document is republished periodically with new RFC numbers; + the latest version must be used. + + + TELNET REFERENCES: + + + [TELNET:1] "Telnet Protocol Specification," J. Postel and J. + Reynolds, RFC-854, May 1983. + + [TELNET:2] "Telnet Option Specification," J. Postel and J. Reynolds, + RFC-855, May 1983. + + [TELNET:3] "Telnet Binary Transmission," J. Postel and J. Reynolds, + RFC-856, May 1983. + + [TELNET:4] "Telnet Echo Option," J. Postel and J. Reynolds, RFC-857, + May 1983. + + [TELNET:5] "Telnet Suppress Go Ahead Option," J. Postel and J. + + + +Internet Engineering Task Force [Page 93] + + + + +RFC1123 SUPPORT SERVICES -- MANAGEMENT October 1989 + + + Reynolds, RFC-858, May 1983. + + [TELNET:6] "Telnet Status Option," J. Postel and J. Reynolds, RFC- + 859, May 1983. + + [TELNET:7] "Telnet Timing Mark Option," J. Postel and J. Reynolds, + RFC-860, May 1983. + + [TELNET:8] "Telnet Extended Options List," J. Postel and J. + Reynolds, RFC-861, May 1983. + + [TELNET:9] "Telnet End-Of-Record Option," J. Postel, RFC-885, + December 1983. + + [TELNET:10] "Telnet Terminal-Type Option," J. VanBokkelen, RFC-1091, + February 1989. + + This document supersedes RFC-930. + + [TELNET:11] "Telnet Window Size Option," D. Waitzman, RFC-1073, + October 1988.
+ + [TELNET:12] "Telnet Linemode Option," D. Borman, RFC-1116, August + 1989. + + [TELNET:13] "Telnet Terminal Speed Option," C. Hedrick, RFC-1079, + December 1988. + + [TELNET:14] "Telnet Remote Flow Control Option," C. Hedrick, RFC- + 1080, November 1988. + + + SECONDARY TELNET REFERENCES: + + + [TELNET:15] "Telnet Protocol," MIL-STD-1782, U.S. Department of + Defense, May 1984. + + This document is intended to describe the same protocol as RFC- + 854. In case of conflict, RFC-854 takes precedence, and the + present document takes precedence over both. + + [TELNET:16] "SUPDUP Protocol," M. Crispin, RFC-734, October 1977. + + [TELNET:17] "Telnet SUPDUP Option," M. Crispin, RFC-736, October + 1977. + + [TELNET:18] "Data Entry Terminal Option," J. Day, RFC-732, June 1977. + + + +Internet Engineering Task Force [Page 94] + + + + +RFC1123 SUPPORT SERVICES -- MANAGEMENT October 1989 + + + [TELNET:19] "TELNET Data Entry Terminal option -- DODIIS + Implementation," A. Yasuda and T. Thompson, RFC-1043, February + 1988. + + + FTP REFERENCES: + + + [FTP:1] "File Transfer Protocol," J. Postel and J. Reynolds, RFC- + 959, October 1985. + + [FTP:2] "Document File Format Standards," J. Postel, RFC-678, + December 1974. + + [FTP:3] "File Transfer Protocol," MIL-STD-1780, U.S. Department of + Defense, May 1984. + + This document is based on an earlier version of the FTP + specification (RFC-765) and is obsolete. + + + TFTP REFERENCES: + + + [TFTP:1] "The TFTP Protocol Revision 2," K. Sollins, RFC-783, June + 1981. + + + MAIL REFERENCES: + + + [SMTP:1] "Simple Mail Transfer Protocol," J. Postel, RFC-821, August + 1982. + + [SMTP:2] "Standard For The Format of ARPA Internet Text Messages," + D. Crocker, RFC-822, August 1982. + + This document obsoleted an earlier specification, RFC-733. + + [SMTP:3] "Mail Routing and the Domain System," C. Partridge, RFC- + 974, January 1986. + + This RFC describes the use of MX records, a mandatory extension + to the mail delivery process. 
+ + [SMTP:4] "Duplicate Messages and SMTP," C. Partridge, RFC-1047, + February 1988. + + + + +Internet Engineering Task Force [Page 95] + + + + +RFC1123 SUPPORT SERVICES -- MANAGEMENT October 1989 + + + [SMTP:5a] "Mapping between X.400 and RFC 822," S. Kille, RFC-987, + June 1986. + + [SMTP:5b] "Addendum to RFC-987," S. Kille, RFC-???, September 1987. + + The two preceding RFC's define a proposed standard for + gatewaying mail between the Internet and the X.400 environments. + + [SMTP:6] "Simple Mail Transfer Protocol," MIL-STD-1781, U.S. + Department of Defense, May 1984. + + This specification is intended to describe the same protocol as + does RFC-821. However, MIL-STD-1781 is incomplete; in + particular, it does not include MX records [SMTP:3]. + + [SMTP:7] "A Content-Type Field for Internet Messages," M. Sirbu, + RFC-1049, March 1988. + + + DOMAIN NAME SYSTEM REFERENCES: + + + [DNS:1] "Domain Names - Concepts and Facilities," P. Mockapetris, + RFC-1034, November 1987. + + This document and the following one obsolete RFC-882, RFC-883, + and RFC-973. + + [DNS:2] "Domain Names - Implementation and Specification," RFC-1035, + P. Mockapetris, November 1987. + + + [DNS:3] "Mail Routing and the Domain System," C. Partridge, RFC-974, + January 1986. + + + [DNS:4] "DoD Internet Host Table Specification," K. Harrenstein, + RFC-952, M. Stahl, E. Feinler, October 1985. + + SECONDARY DNS REFERENCES: + + + [DNS:5] "Hostname Server," K. Harrenstein, M. Stahl, E. Feinler, + RFC-953, October 1985. + + [DNS:6] "Domain Administrators Guide," M. Stahl, RFC-1032, November + 1987. + + + + +Internet Engineering Task Force [Page 96] + + + + +RFC1123 SUPPORT SERVICES -- MANAGEMENT October 1989 + + + [DNS:7] "Domain Administrators Operations Guide," M. Lottor, RFC- + 1033, November 1987. + + [DNS:8] "The Domain Name System Handbook," Vol. 4 of Internet + Protocol Handbook, NIC 50007, SRI Network Information Center, + August 1989. 
+ + + SYSTEM INITIALIZATION REFERENCES: + + + [BOOT:1] "Bootstrap Loading Using TFTP," R. Finlayson, RFC-906, June + 1984. + + [BOOT:2] "Bootstrap Protocol (BOOTP)," W. Croft and J. Gilmore, RFC- + 951, September 1985. + + [BOOT:3] "BOOTP Vendor Information Extensions," J. Reynolds, RFC- + 1084, December 1988. + + Note: this RFC revised and obsoleted RFC-1048. + + [BOOT:4] "A Reverse Address Resolution Protocol," R. Finlayson, T. + Mann, J. Mogul, and M. Theimer, RFC-903, June 1984. + + + MANAGEMENT REFERENCES: + + + [MGT:1] "IAB Recommendations for the Development of Internet Network + Management Standards," V. Cerf, RFC-1052, April 1988. + + [MGT:2] "Structure and Identification of Management Information for + TCP/IP-based internets," M. Rose and K. McCloghrie, RFC-1065, + August 1988. + + [MGT:3] "Management Information Base for Network Management of + TCP/IP-based internets," M. Rose and K. McCloghrie, RFC-1066, + August 1988. + + [MGT:4] "A Simple Network Management Protocol," J. Case, M. Fedor, + M. Schoffstall, and C. Davin, RFC-1098, April 1989. + + [MGT:5] "The Common Management Information Services and Protocol + over TCP/IP," U. Warrier and L. Besaw, RFC-1095, April 1989. + + [MGT:6] "Report of the Second Ad Hoc Network Management Review + Group," V. Cerf, RFC-1109, August 1989. + + + +Internet Engineering Task Force [Page 97] + + + + +RFC1123 SUPPORT SERVICES -- MANAGEMENT October 1989 + + +Security Considerations + + There are many security issues in the application and support + programs of host software, but a full discussion is beyond the scope + of this RFC. Security-related issues are mentioned in sections + concerning TFTP (Sections 4.2.1, 4.2.3.4, 4.2.3.5), the SMTP VRFY and + EXPN commands (Section 5.2.3), the SMTP HELO command (5.2.5), and the + SMTP DATA command (Section 5.2.8). 
+ +Author's Address + + Robert Braden + USC/Information Sciences Institute + 4676 Admiralty Way + Marina del Rey, CA 90292-6695 + + Phone: (213) 822 1511 + + EMail: Braden@ISI.EDU + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Internet Engineering Task Force [Page 98] + diff --git a/ext/picotcp/RFC/rfc1323.txt b/ext/picotcp/RFC/rfc1323.txt new file mode 100644 index 0000000..356eaa8 --- /dev/null +++ b/ext/picotcp/RFC/rfc1323.txt @@ -0,0 +1,2075 @@ + + + + + + +Network Working Group V. Jacobson +Request for Comments: 1323 LBL +Obsoletes: RFC 1072, RFC 1185 R. Braden + ISI + D. Borman + Cray Research + May 1992 + + + TCP Extensions for High Performance + +Status of This Memo + + This RFC specifies an IAB standards track protocol for the Internet + community, and requests discussion and suggestions for improvements. + Please refer to the current edition of the "IAB Official Protocol + Standards" for the standardization state and status of this protocol. + Distribution of this memo is unlimited. + +Abstract + + This memo presents a set of TCP extensions to improve performance + over large bandwidth*delay product paths and to provide reliable + operation over very high-speed paths. It defines new TCP options for + scaled windows and timestamps, which are designed to provide + compatible interworking with TCP's that do not implement the + extensions. The timestamps are used for two distinct mechanisms: + RTTM (Round Trip Time Measurement) and PAWS (Protect Against Wrapped + Sequences). Selective acknowledgments are not included in this memo. + + This memo combines and supersedes RFC-1072 and RFC-1185, adding + additional clarification and more detailed specification. Appendix C + summarizes the changes from the earlier RFCs. + +TABLE OF CONTENTS + + 1. Introduction ................................................. 2 + 2. TCP Window Scale Option ...................................... 8 + 3. RTTM -- Round-Trip Time Measurement .......................... 
11 + 4. PAWS -- Protect Against Wrapped Sequence Numbers ............. 17 + 5. Conclusions and Acknowledgments .............................. 25 + 6. References ................................................... 25 + APPENDIX A: Implementation Suggestions ........................... 27 + APPENDIX B: Duplicates from Earlier Connection Incarnations ...... 27 + APPENDIX C: Changes from RFC-1072, RFC-1185 ...................... 30 + APPENDIX D: Summary of Notation .................................. 31 + APPENDIX E: Event Processing ..................................... 32 + Security Considerations .......................................... 37 + + + +Jacobson, Braden, & Borman [Page 1] + +RFC 1323 TCP Extensions for High Performance May 1992 + + + Authors' Addresses ............................................... 37 + +1. INTRODUCTION + + The TCP protocol [Postel81] was designed to operate reliably over + almost any transmission medium regardless of transmission rate, + delay, corruption, duplication, or reordering of segments. + Production TCP implementations currently adapt to transfer rates in + the range of 100 bps to 10**7 bps and round-trip delays in the range + 1 ms to 100 seconds. Recent work on TCP performance has shown that + TCP can work well over a variety of Internet paths, ranging from 800 + Mbit/sec I/O channels to 300 bit/sec dial-up modems [Jacobson88a]. + + The introduction of fiber optics is resulting in ever-higher + transmission speeds, and the fastest paths are moving out of the + domain for which TCP was originally engineered. This memo defines a + set of modest extensions to TCP to extend the domain of its + application to match this increasing network capability. It is based + upon and obsoletes RFC-1072 [Jacobson88b] and RFC-1185 [Jacobson90b]. + + There is no one-line answer to the question: "How fast can TCP go?". + There are two separate kinds of issues, performance and reliability, + and each depends upon different parameters. 
We discuss each in turn. + + 1.1 TCP Performance + + TCP performance depends not upon the transfer rate itself, but + rather upon the product of the transfer rate and the round-trip + delay. This "bandwidth*delay product" measures the amount of data + that would "fill the pipe"; it is the buffer space required at + sender and receiver to obtain maximum throughput on the TCP + connection over the path, i.e., the amount of unacknowledged data + that TCP must handle in order to keep the pipeline full. TCP + performance problems arise when the bandwidth*delay product is + large. We refer to an Internet path operating in this region as a + "long, fat pipe", and a network containing this path as an "LFN" + (pronounced "elephan(t)"). + + High-capacity packet satellite channels (e.g., DARPA's Wideband + Net) are LFN's. For example, a DS1-speed satellite channel has a + bandwidth*delay product of 10**6 bits or more; this corresponds to + 100 outstanding TCP segments of 1200 bytes each. Terrestrial + fiber-optical paths will also fall into the LFN class; for + example, a cross-country delay of 30 ms at a DS3 bandwidth + (45Mbps) also exceeds 10**6 bits. + + There are three fundamental performance problems with the current + TCP over LFN paths: + + + +Jacobson, Braden, & Borman [Page 2] + +RFC 1323 TCP Extensions for High Performance May 1992 + + + (1) Window Size Limit + + The TCP header uses a 16 bit field to report the receive + window size to the sender. Therefore, the largest window + that can be used is 2**16 = 65K bytes. + + To circumvent this problem, Section 2 of this memo defines a + new TCP option, "Window Scale", to allow windows larger than + 2**16. This option defines an implicit scale factor, which + is used to multiply the window size value found in a TCP + header to obtain the true window size. + + (2) Recovery from Losses + + Packet losses in an LFN can have a catastrophic effect on + throughput. 
Until recently, properly-operating TCP + implementations would cause the data pipeline to drain with + every packet loss, and require a slow-start action to + recover. Recently, the Fast Retransmit and Fast Recovery + algorithms [Jacobson90c] have been introduced. Their + combined effect is to recover from one packet loss per + window, without draining the pipeline. However, more than + one packet loss per window typically results in a + retransmission timeout and the resulting pipeline drain and + slow start. + + Expanding the window size to match the capacity of an LFN + results in a corresponding increase of the probability of + more than one packet per window being dropped. This could + have a devastating effect upon the throughput of TCP over an + LFN. In addition, if a congestion control mechanism based + upon some form of random dropping were introduced into + gateways, randomly spaced packet drops would become common, + possibly increasing the probability of dropping more than one + packet per window. + + To generalize the Fast Retransmit/Fast Recovery mechanism to + handle multiple packets dropped per window, selective + acknowledgments are required. Unlike the normal cumulative + acknowledgments of TCP, selective acknowledgments give the + sender a complete picture of which segments are queued at the + receiver and which have not yet arrived. Some evidence in + favor of selective acknowledgments has been published + [NBS85], and selective acknowledgments have been included in + a number of experimental Internet protocols -- VMTP + [Cheriton88], NETBLT [Clark87], and RDP [Velten84], and + proposed for OSI TP4 [NBS85]. However, in the non-LFN + regime, selective acknowledgments reduce the number of + + + +Jacobson, Braden, & Borman [Page 3] + +RFC 1323 TCP Extensions for High Performance May 1992 + + + packets retransmitted but do not otherwise improve + performance, making their complexity of questionable value.
+ However, selective acknowledgments are expected to become + much more important in the LFN regime. + + RFC-1072 defined a new TCP "SACK" option to send a selective + acknowledgment. However, there are important technical + issues to be worked out concerning both the format and + semantics of the SACK option. Therefore, SACK has been + omitted from this package of extensions. It is hoped that + SACK can "catch up" during the standardization process. + + (3) Round-Trip Measurement + + TCP implements reliable data delivery by retransmitting + segments that are not acknowledged within some retransmission + timeout (RTO) interval. Accurate dynamic determination of an + appropriate RTO is essential to TCP performance. RTO is + determined by estimating the mean and variance of the + measured round-trip time (RTT), i.e., the time interval + between sending a segment and receiving an acknowledgment for + it [Jacobson88a]. + + Section 3 introduces a new TCP option, "Timestamps", and then + defines a mechanism using this option that allows nearly + every segment, including retransmissions, to be timed at + negligible computational cost. We use the mnemonic RTTM + (Round Trip Time Measurement) for this mechanism, to + distinguish it from other uses of the Timestamps option. + + + 1.2 TCP Reliability + + Now we turn from performance to reliability. High transfer rate + enters TCP performance through the bandwidth*delay product. + However, high transfer rate alone can threaten TCP reliability by + violating the assumptions behind the TCP mechanism for duplicate + detection and sequencing. + + An especially serious kind of error may result from an accidental + reuse of TCP sequence numbers in data segments. Suppose that an + "old duplicate segment", e.g., a duplicate data segment that was + delayed in Internet queues, is delivered to the receiver at the + wrong moment, so that its sequence number falls somewhere within + the current window.
There would be no checksum failure to warn of + the error, and the result could be an undetected corruption of the + data. Reception of an old duplicate ACK segment at the + transmitter could be only slightly less serious: it is likely to + + + +Jacobson, Braden, & Borman [Page 4] + +RFC 1323 TCP Extensions for High Performance May 1992 + + + lock up the connection so that no further progress can be made, + forcing an RST on the connection. + + TCP reliability depends upon the existence of a bound on the + lifetime of a segment: the "Maximum Segment Lifetime" or MSL. An + MSL is generally required by any reliable transport protocol, + since every sequence number field must be finite, and therefore + any sequence number may eventually be reused. In the Internet + protocol suite, the MSL bound is enforced by an IP-layer + mechanism, the "Time-to-Live" or TTL field. + + Duplication of sequence numbers might happen in either of two + ways: + + (1) Sequence number wrap-around on the current connection + + A TCP sequence number contains 32 bits. At a high enough + transfer rate, the 32-bit sequence space may be "wrapped" + (cycled) within the time that a segment is delayed in queues. + + (2) Earlier incarnation of the connection + + Suppose that a connection terminates, either by a proper + close sequence or due to a host crash, and the same + connection (i.e., using the same pair of sockets) is + immediately reopened. A delayed segment from the terminated + connection could fall within the current window for the new + incarnation and be accepted as valid. + + Duplicates from earlier incarnations, Case (2), are avoided by + enforcing the current fixed MSL of the TCP spec, as explained in + Section 5.3 and Appendix B. However, case (1), avoiding the + reuse of sequence numbers within the same connection, requires an + MSL bound that depends upon the transfer rate, and at high enough + rates, a new mechanism is required. 
+ + More specifically, if the maximum effective bandwidth at which TCP + is able to transmit over a particular path is B bytes per second, + then the following constraint must be satisfied for error-free + operation: + + 2**31 / B > MSL (secs) [1] + + The following table shows the value for Twrap = 2**31/B in + seconds, for some important values of the bandwidth B: + + + + + + +Jacobson, Braden, & Borman [Page 5] + +RFC 1323 TCP Extensions for High Performance May 1992 + + + Network B*8 B Twrap + bits/sec bytes/sec secs + _______ _______ ______ ______ + + ARPANET 56kbps 7KBps 3*10**5 (~3.6 days) + + DS1 1.5Mbps 190KBps 10**4 (~3 hours) + + Ethernet 10Mbps 1.25MBps 1700 (~30 mins) + + DS3 45Mbps 5.6MBps 380 + + FDDI 100Mbps 12.5MBps 170 + + Gigabit 1Gbps 125MBps 17 + + + It is clear that wrap-around of the sequence space is not a + problem for 56kbps packet switching or even 10Mbps Ethernets. On + the other hand, at DS3 and FDDI speeds, Twrap is comparable to the + 2 minute MSL assumed by the TCP specification [Postel81]. Moving + towards gigabit speeds, Twrap becomes too small for reliable + enforcement by the Internet TTL mechanism. + + The 16-bit window field of TCP limits the effective bandwidth B to + 2**16/RTT, where RTT is the round-trip time in seconds + [McKenzie89]. If the RTT is large enough, this limits B to a + value that meets the constraint [1] for a large MSL value. For + example, consider a transcontinental backbone with an RTT of 60ms + (set by the laws of physics). With the bandwidth*delay product + limited to 64KB by the TCP window size, B is then limited to + 1.1MBps, no matter how high the theoretical transfer rate of the + path. This corresponds to cycling the sequence number space in + Twrap= 2000 secs, which is safe in today's Internet. + + It is important to understand that the culprit is not the larger + window but rather the high bandwidth. For example, consider a + (very large) FDDI LAN with a diameter of 10km. 
Using the speed of + light, we can compute the RTT across the ring as + (2*10**4)/(3*10**8) = 67 microseconds, and the delay*bandwidth + product is then 833 bytes. A TCP connection across this LAN using + a window of only 833 bytes will run at the full 100mbps and can + wrap the sequence space in about 3 minutes, very close to the MSL + of TCP. Thus, high speed alone can cause a reliability problem + with sequence number wrap-around, even without extended windows. + + Watson's Delta-T protocol [Watson81] includes network-layer + mechanisms for precise enforcement of an MSL. In contrast, the IP + + + +Jacobson, Braden, & Borman [Page 6] + +RFC 1323 TCP Extensions for High Performance May 1992 + + + mechanism for MSL enforcement is loosely defined and even more + loosely implemented in the Internet. Therefore, it is unwise to + depend upon active enforcement of MSL for TCP connections, and it + is unrealistic to imagine setting MSL's smaller than the current + values (e.g., 120 seconds specified for TCP). + + A possible fix for the problem of cycling the sequence space would + be to increase the size of the TCP sequence number field. For + example, the sequence number field (and also the acknowledgment + field) could be expanded to 64 bits. This could be done either by + changing the TCP header or by means of an additional option. + + Section 4 presents a different mechanism, which we call PAWS + (Protect Against Wrapped Sequence numbers), to extend TCP + reliability to transfer rates well beyond the foreseeable upper + limit of network bandwidths. PAWS uses the TCP Timestamps option + defined in Section 3 to protect against old duplicates from the + same connection. + + 1.3 Using TCP options + + The extensions defined in this memo all use new TCP options. We + must address two possible issues concerning the use of TCP + options: (1) compatibility and (2) overhead.
+ + We must pay careful attention to compatibility, i.e., to + interoperation with existing implementations. The only TCP option + defined previously, MSS, may appear only on a SYN segment. Every + implementation should (and we expect that most will) ignore + unknown options on SYN segments. However, some buggy TCP + implementation might be crashed by the first appearance of an + option on a non-SYN segment. Therefore, for each of the + extensions defined below, TCP options will be sent on non-SYN + segments only when an exchange of options on the SYN segments has + indicated that both sides understand the extension. Furthermore, + an extension option will be sent in a <SYN,ACK> segment only if + the corresponding option was received in the initial <SYN> + segment. + + A question may be raised about the bandwidth and processing + overhead for TCP options. Those options that occur on SYN + segments are not likely to cause a performance concern. Opening a + TCP connection requires execution of significant special-case + code, and the processing of options is unlikely to increase that + cost significantly. + + On the other hand, a Timestamps option may appear in any data or + ACK segment, adding 12 bytes to the 20-byte TCP header. We + + + +Jacobson, Braden, & Borman [Page 7] + +RFC 1323 TCP Extensions for High Performance May 1992 + + + believe that the bandwidth saved by reducing unnecessary + retransmissions will more than pay for the extra header bandwidth. + + There is also an issue about the processing overhead for parsing + the variable byte-aligned format of options, particularly with a + RISC-architecture CPU. To meet this concern, Appendix A contains + a recommended layout of the options in TCP headers to achieve + reasonable data field alignment. In the spirit of Header + Prediction, a TCP can quickly test for this layout and if it is + verified then use a fast path.
Hosts that use this canonical + layout will effectively use the options as a set of fixed-format + fields appended to the TCP header. However, to retain the + philosophical and protocol framework of TCP options, a TCP must be + prepared to parse an arbitrary options field, albeit with less + efficiency. + + Finally, we observe that most of the mechanisms defined in this + memo are important for LFN's and/or very high-speed networks. For + low-speed networks, it might be a performance optimization to NOT + use these mechanisms. A TCP vendor concerned about optimal + performance over low-speed paths might consider turning these + extensions off for low-speed paths, or allow a user or + installation manager to disable them. + + +2. TCP WINDOW SCALE OPTION + + 2.1 Introduction + + The window scale extension expands the definition of the TCP + window to 32 bits and then uses a scale factor to carry this 32- + bit value in the 16-bit Window field of the TCP header (SEG.WND in + RFC-793). The scale factor is carried in a new TCP option, Window + Scale. This option is sent only in a SYN segment (a segment with + the SYN bit on), hence the window scale is fixed in each direction + when a connection is opened. (Another design choice would be to + specify the window scale in every TCP segment. It would be + incorrect to send a window scale option only when the scale factor + changed, since a TCP option in an acknowledgement segment will not + be delivered reliably (unless the ACK happens to be piggy-backed + on data in the other direction). Fixing the scale when the + connection is opened has the advantage of lower overhead but the + disadvantage that the scale factor cannot be changed during the + connection.) + + The maximum receive window, and therefore the scale factor, is + determined by the maximum receive buffer space. 
In a typical + modern implementation, this maximum buffer space is set by default + + + +Jacobson, Braden, & Borman [Page 8] + +RFC 1323 TCP Extensions for High Performance May 1992 + + + but can be overridden by a user program before a TCP connection is + opened. This determines the scale factor, and therefore no new + user interface is needed for window scaling. + + 2.2 Window Scale Option + + The three-byte Window Scale option may be sent in a SYN segment by + a TCP. It has two purposes: (1) indicate that the TCP is prepared + to do both send and receive window scaling, and (2) communicate a + scale factor to be applied to its receive window. Thus, a TCP + that is prepared to scale windows should send the option, even if + its own scale factor is 1. The scale factor is limited to a power + of two and encoded logarithmically, so it may be implemented by + binary shift operations. + + + TCP Window Scale Option (WSopt): + + Kind: 3 Length: 3 bytes + + +---------+---------+---------+ + | Kind=3 |Length=3 |shift.cnt| + +---------+---------+---------+ + + + This option is an offer, not a promise; both sides must send + Window Scale options in their SYN segments to enable window + scaling in either direction. If window scaling is enabled, + then the TCP that sent this option will right-shift its true + receive-window values by 'shift.cnt' bits for transmission in + SEG.WND. The value 'shift.cnt' may be zero (offering to scale, + while applying a scale factor of 1 to the receive window). + + This option may be sent in an initial <SYN> segment (i.e., a + segment with the SYN bit on and the ACK bit off). It may also + be sent in a <SYN,ACK> segment, but only if a Window Scale op- + tion was received in the initial <SYN> segment. A Window Scale + option in a segment without a SYN bit should be ignored. + + The Window field in a SYN (i.e., a <SYN> or <SYN,ACK>) segment + itself is never scaled.
+ + 2.3 Using the Window Scale Option + + A model implementation of window scaling is as follows, using the + notation of RFC-793 [Postel81]: + + * All windows are treated as 32-bit quantities for storage in + + + +Jacobson, Braden, & Borman [Page 9] + +RFC 1323 TCP Extensions for High Performance May 1992 + + + the connection control block and for local calculations. + This includes the send-window (SND.WND) and the receive- + window (RCV.WND) values, as well as the congestion window. + + * The connection state is augmented by two window shift counts, + Snd.Wind.Scale and Rcv.Wind.Scale, to be applied to the + incoming and outgoing window fields, respectively. + + * If a TCP receives a <SYN> segment containing a Window Scale + option, it sends its own Window Scale option in the + <SYN,ACK> segment. + + * The Window Scale option is sent with shift.cnt = R, where R + is the value that the TCP would like to use for its receive + window. + + * Upon receiving a SYN segment with a Window Scale option + containing shift.cnt = S, a TCP sets Snd.Wind.Scale to S and + sets Rcv.Wind.Scale to R; otherwise, it sets both + Snd.Wind.Scale and Rcv.Wind.Scale to zero. + + * The window field (SEG.WND) in the header of every incoming + segment, with the exception of SYN segments, is left-shifted + by Snd.Wind.Scale bits before updating SND.WND: + + SND.WND = SEG.WND << Snd.Wind.Scale + + (assuming the other conditions of RFC793 are met, and using + the "C" notation "<<" for left-shift). + + * The window field (SEG.WND) of every outgoing segment, with + the exception of SYN segments, is right-shifted by + Rcv.Wind.Scale bits: + + SEG.WND = RCV.WND >> Rcv.Wind.Scale. + + + TCP determines if a data segment is "old" or "new" by testing + whether its sequence number is within 2**31 bytes of the left edge + of the window, and if it is not, discarding the data as "old". 
To + ensure that new data is never mistakenly considered old and vice- + versa, the left edge of the sender's window has to be at most + 2**31 away from the right edge of the receiver's window. + Similarly with the sender's right edge and receiver's left edge. + Since the right and left edges of either the sender's or + receiver's window differ by the window size, and since the sender + and receiver windows can be out of phase by at most the window + size, the above constraints imply that 2 * the max window size + + + +Jacobson, Braden, & Borman [Page 10] + +RFC 1323 TCP Extensions for High Performance May 1992 + + + must be less than 2**31, or + + max window < 2**30 + + Since the max window is 2**S (where S is the scaling shift count) + times at most 2**16 - 1 (the maximum unscaled window), the maximum + window is guaranteed to be < 2**30 if S <= 14. Thus, the shift + count must be limited to 14 (which allows windows of 2**30 = 1 + Gbyte). If a Window Scale option is received with a shift.cnt + value exceeding 14, the TCP should log the error but use 14 + instead of the specified value. + + The scale factor applies only to the Window field as transmitted + in the TCP header; each TCP using extended windows will maintain + the window values locally as 32-bit numbers. For example, the + "congestion window" computed by Slow Start and Congestion + Avoidance is not affected by the scale factor, so window scaling + will not introduce quantization into the congestion window. + +3. RTTM: ROUND-TRIP TIME MEASUREMENT + + 3.1 Introduction + + Accurate and current RTT estimates are necessary to adapt to + changing traffic conditions and to avoid an instability known as + "congestion collapse" [Nagle84] in a busy network. However, + accurate measurement of RTT may be difficult both in theory and in + implementation. + + Many TCP implementations base their RTT measurements upon a sample + of only one packet per window. 
While this yields an adequate + approximation to the RTT for small windows, it results in an + unacceptably poor RTT estimate for an LFN. If we look at RTT + estimation as a signal processing problem (which it is), a data + signal at some frequency, the packet rate, is being sampled at a + lower frequency, the window rate. This lower sampling frequency + violates Nyquist's criteria and may therefore introduce "aliasing" + artifacts into the estimated RTT [Hamming77]. + + A good RTT estimator with a conservative retransmission timeout + calculation can tolerate aliasing when the sampling frequency is + "close" to the data frequency. For example, with a window of 8 + packets, the sample rate is 1/8 the data frequency -- less than an + order of magnitude different. However, when the window is tens or + hundreds of packets, the RTT estimator may be seriously in error, + resulting in spurious retransmissions. + + If there are dropped packets, the problem becomes worse. Zhang + + + +Jacobson, Braden, & Borman [Page 11] + +RFC 1323 TCP Extensions for High Performance May 1992 + + + [Zhang86], Jain [Jain86] and Karn [Karn87] have shown that it is + not possible to accumulate reliable RTT estimates if retransmitted + segments are included in the estimate. Since a full window of + data will have been transmitted prior to a retransmission, all of + the segments in that window will have to be ACKed before the next + RTT sample can be taken. This means at least an additional + window's worth of time between RTT measurements and, as the error + rate approaches one per window of data (e.g., 10**-6 errors per + bit for the Wideband satellite network), it becomes effectively + impossible to obtain a valid RTT measurement. + + A solution to these problems, which actually simplifies the sender + substantially, is as follows: using TCP options, the sender places + a timestamp in each data segment, and the receiver reflects these + timestamps back in ACK segments. 
Then a single subtract gives the + sender an accurate RTT measurement for every ACK segment (which + will correspond to every other data segment, with a sensible + receiver). We call this the RTTM (Round-Trip Time Measurement) + mechanism. + + It is vitally important to use the RTTM mechanism with big + windows; otherwise, the door is opened to some dangerous + instabilities due to aliasing. Furthermore, the option is + probably useful for all TCP's, since it simplifies the sender. + + 3.2 TCP Timestamps Option + + TCP is a symmetric protocol, allowing data to be sent at any time + in either direction, and therefore timestamp echoing may occur in + either direction. For simplicity and symmetry, we specify that + timestamps always be sent and echoed in both directions. For + efficiency, we combine the timestamp and timestamp reply fields + into a single TCP Timestamps Option. + + + + + + + + + + + + + + + + + + +Jacobson, Braden, & Borman [Page 12] + +RFC 1323 TCP Extensions for High Performance May 1992 + + + TCP Timestamps Option (TSopt): + + Kind: 8 + + Length: 10 bytes + + +-------+-------+---------------------+---------------------+ + |Kind=8 | 10 | TS Value (TSval) |TS Echo Reply (TSecr)| + +-------+-------+---------------------+---------------------+ + 1 1 4 4 + + The Timestamps option carries two four-byte timestamp fields. + The Timestamp Value field (TSval) contains the current value of + the timestamp clock of the TCP sending the option. + + The Timestamp Echo Reply field (TSecr) is only valid if the ACK + bit is set in the TCP header; if it is valid, it echos a times- + tamp value that was sent by the remote TCP in the TSval field + of a Timestamps option. When TSecr is not valid, its value + must be zero. The TSecr value will generally be from the most + recent Timestamp option that was received; however, there are + exceptions that are explained below. 
+ + A TCP may send the Timestamps option (TSopt) in an initial + <SYN> segment (i.e., segment containing a SYN bit and no ACK + bit), and may send a TSopt in other segments only if it re- + ceived a TSopt in the initial <SYN> segment for the connection. + + 3.3 The RTTM Mechanism + + The timestamp value to be sent in TSval is to be obtained from a + (virtual) clock that we call the "timestamp clock". Its values + must be at least approximately proportional to real time, in order + to measure actual RTT. + + The following example illustrates a one-way data flow with + segments arriving in sequence without loss. Here A, B, C... + represent data blocks occupying successive blocks of sequence + numbers, and ACK(A),... represent the corresponding cumulative + acknowledgments. The two timestamp fields of the Timestamps + option are shown symbolically as <TSval=x,TSecr=y>. Each TSecr + field contains the value most recently received in a TSval field. + + + + + + + + + +Jacobson, Braden, & Borman [Page 13] + +RFC 1323 TCP Extensions for High Performance May 1992 + + + + TCP A TCP B + + <A,TSval=1,TSecr=120> ------> + + <---- <ACK(A),TSval=127,TSecr=1> + + <B,TSval=5,TSecr=127> ------> + + <---- <ACK(B),TSval=131,TSecr=5> + + . . . . . . . . . . . . . . . . . . . . . . + + <C,TSval=65,TSecr=131> ------> + + <---- <ACK(C),TSval=191,TSecr=65> + + (etc) + + + The dotted line marks a pause (60 time units long) in which A had + nothing to send. Note that this pause inflates the RTT which B + could infer from receiving TSecr=131 in data segment C. Thus, in + one-way data flows, RTTM in the reverse direction measures a value + that is inflated by gaps in sending data. However, the following + rule prevents a resulting inflation of the measured RTT: + + A TSecr value received in a segment is used to update the + averaged RTT measurement only if the segment acknowledges + some new data, i.e., only if it advances the left edge of the + send window. + + Since TCP B is not sending data, the data segment C does not + acknowledge any new data when it arrives at B. Thus, the inflated + RTTM measurement is not used to update B's RTTM measurement. 
+ + 3.4 Which Timestamp to Echo + + If more than one Timestamps option is received before a reply + segment is sent, the TCP must choose only one of the TSvals to + echo, ignoring the others. To minimize the state kept in the + receiver (i.e., the number of unprocessed TSvals), the receiver + should be required to retain at most one timestamp in the + connection control block. + + + + + + + +Jacobson, Braden, & Borman [Page 14] + +RFC 1323 TCP Extensions for High Performance May 1992 + + + There are three situations to consider: + + (A) Delayed ACKs. + + Many TCP's acknowledge only every Kth segment out of a group + of segments arriving within a short time interval; this + policy is known generally as "delayed ACKs". The data-sender + TCP must measure the effective RTT, including the additional + time due to delayed ACKs, or else it will retransmit + unnecessarily. Thus, when delayed ACKs are in use, the + receiver should reply with the TSval field from the earliest + unacknowledged segment. + + (B) A hole in the sequence space (segment(s) have been lost). + + The sender will continue sending until the window is filled, + and the receiver may be generating ACKs as these out-of-order + segments arrive (e.g., to aid "fast retransmit"). + + The lost segment is probably a sign of congestion, and in + that situation the sender should be conservative about + retransmission. Furthermore, it is better to overestimate + than underestimate the RTT. An ACK for an out-of-order + segment should therefore contain the timestamp from the most + recent segment that advanced the window. + + The same situation occurs if segments are re-ordered by the + network. + + (C) A filled hole in the sequence space. + + The segment that fills the hole represents the most recent + measurement of the network characteristics. On the other + hand, an RTT computed from an earlier segment would probably + include the sender's retransmit time-out, badly biasing the + sender's average RTT estimate. 
Thus, the timestamp from the + latest segment (which filled the hole) must be echoed. + + An algorithm that covers all three cases is described in the + following rules for Timestamps option processing on a synchronized + connection: + + (1) The connection state is augmented with two 32-bit slots: + TS.Recent holds a timestamp to be echoed in TSecr whenever a + segment is sent, and Last.ACK.sent holds the ACK field from + the last segment sent. Last.ACK.sent will equal RCV.NXT + except when ACKs have been delayed. + + + + +Jacobson, Braden, & Borman [Page 15] + +RFC 1323 TCP Extensions for High Performance May 1992 + + + (2) If Last.ACK.sent falls within the range of sequence numbers + of an incoming segment: + + SEG.SEQ <= Last.ACK.sent < SEG.SEQ + SEG.LEN + + then the TSval from the segment is copied to TS.Recent; + otherwise, the TSval is ignored. + + (3) When a TSopt is sent, its TSecr field is set to the current + TS.Recent value. + + The following examples illustrate these rules. Here A, B, C... + represent data segments occupying successive blocks of sequence + numbers, and ACK(A),... represent the corresponding + acknowledgment segments. Note that ACK(A) has the same sequence + number as B. We show only one direction of timestamp echoing, for + clarity. + + + o Packets arrive in sequence, and some of the ACKs are delayed. + + By Case (A), the timestamp from the oldest unacknowledged + segment is echoed. + + TS.Recent + <A, TSval=1> -------------------> + 1 + <B, TSval=2> -------------------> + 1 + <C, TSval=3> -------------------> + 1 + <---- <ACK(C), TSecr=1> + (etc) + + o Packets arrive out of order, and every packet is + acknowledged. + + By Case (B), the timestamp from the last segment that + advanced the left window edge is echoed, until the missing + segment arrives; it is echoed according to Case (C). The + same sequence would occur if segments B and D were lost and + retransmitted. 
+ + + + + + + + +Jacobson, Braden, & Borman [Page 16] + +RFC 1323 TCP Extensions for High Performance May 1992 + + + TS.Recent + <A, TSval=1> -------------------> + 1 + <---- <ACK(A), TSecr=1> + 1 + <C, TSval=3> -------------------> + 1 + <---- <ACK(A), TSecr=1> + 1 + <B, TSval=2> -------------------> + 2 + <---- <ACK(C), TSecr=2> + 2 + <E, TSval=5> -------------------> + 2 + <---- <ACK(C), TSecr=2> + 2 + <D, TSval=4> -------------------> + 4 + <---- <ACK(E), TSecr=4> + (etc) + + + + +4. PAWS: PROTECT AGAINST WRAPPED SEQUENCE NUMBERS + + 4.1 Introduction + + Section 4.2 describes a simple mechanism to reject old duplicate + segments that might corrupt an open TCP connection; we call this + mechanism PAWS (Protect Against Wrapped Sequence numbers). PAWS + operates within a single TCP connection, using state that is saved + in the connection control block. Section 4.3 and Appendix C + discuss the implications of the PAWS mechanism for avoiding old + duplicates from previous incarnations of the same connection. + + 4.2 The PAWS Mechanism + + PAWS uses the same TCP Timestamps option as the RTTM mechanism + described earlier, and assumes that every received TCP segment + (including data and ACK segments) contains a timestamp SEG.TSval + whose values are monotone non-decreasing in time. The basic idea + is that a segment can be discarded as an old duplicate if it is + received with a timestamp SEG.TSval less than some timestamp + recently received on this connection. + + In both the PAWS and the RTTM mechanism, the "timestamps" are 32- + + + +Jacobson, Braden, & Borman [Page 17] + +RFC 1323 TCP Extensions for High Performance May 1992 + + + bit unsigned integers in a modular 32-bit space. Thus, "less + than" is defined the same way it is for TCP sequence numbers, and + the same implementation techniques apply. If s and t are + timestamp values, s < t if 0 < (t - s) < 2**31, computed in + unsigned 32-bit arithmetic. + + The choice of incoming timestamps to be saved for this comparison + must guarantee a value that is monotone increasing. 
For example, + we might save the timestamp from the segment that last advanced + the left edge of the receive window, i.e., the most recent in- + sequence segment. Instead, we choose the value TS.Recent + introduced in Section 3.4 for the RTTM mechanism, since using a + common value for both PAWS and RTTM simplifies the implementation + of both. As Section 3.4 explained, TS.Recent differs from the + timestamp from the last in-sequence segment only in the case of + delayed ACKs, and therefore by less than one window. Either + choice will therefore protect against sequence number wrap-around. + + RTTM was specified in a symmetrical manner, so that TSval + timestamps are carried in both data and ACK segments and are + echoed in TSecr fields carried in returning ACK or data segments. + PAWS submits all incoming segments to the same test, and therefore + protects against duplicate ACK segments as well as data segments. + (An alternative un-symmetric algorithm would protect against old + duplicate ACKs: the sender of data would reject incoming ACK + segments whose TSecr values were less than the TSecr saved from + the last segment whose ACK field advanced the left edge of the + send window. This algorithm was deemed to lack economy of + mechanism and symmetry.) + + TSval timestamps sent on {SYN} and {SYN,ACK} segments are used to + initialize PAWS. PAWS protects against old duplicate non-SYN + segments, and duplicate SYN segments received while there is a + synchronized connection. Duplicate {SYN} and {SYN,ACK} segments + received when there is no connection will be discarded by the + normal 3-way handshake and sequence number checks of TCP. + + It is recommended that RST segments NOT carry timestamps, and that + RST segments be acceptable regardless of their timestamp. Old + duplicate RST segments should be exceedingly unlikely, and their + cleanup function should take precedence over timestamps. 
+ + 4.2.1 Basic PAWS Algorithm + + The PAWS algorithm requires the following processing to be + performed on all incoming segments for a synchronized + connection: + + + + +Jacobson, Braden, & Borman [Page 18] + +RFC 1323 TCP Extensions for High Performance May 1992 + + + R1) If there is a Timestamps option in the arriving segment + and SEG.TSval < TS.Recent and if TS.Recent is valid (see + later discussion), then treat the arriving segment as not + acceptable: + + Send an acknowledgement in reply as specified in + RFC-793 page 69 and drop the segment. + + Note: it is necessary to send an ACK segment in order + to retain TCP's mechanisms for detecting and + recovering from half-open connections. For example, + see Figure 10 of RFC-793. + + R2) If the segment is outside the window, reject it (normal + TCP processing) + + R3) If an arriving segment satisfies: SEG.SEQ <= Last.ACK.sent + (see Section 3.4), then record its timestamp in TS.Recent. + + R4) If an arriving segment is in-sequence (i.e., at the left + window edge), then accept it normally. + + R5) Otherwise, treat the segment as a normal in-window, out- + of-sequence TCP segment (e.g., queue it for later delivery + to the user). + + Steps R2, R4, and R5 are the normal TCP processing steps + specified by RFC-793. + + It is important to note that the timestamp is checked only when + a segment first arrives at the receiver, regardless of whether + it is in-sequence or it must be queued for later delivery. + Consider the following example. + + Suppose the segment sequence: A.1, B.1, C.1, ..., Z.1 has + been sent, where the letter indicates the sequence number + and the digit represents the timestamp. Suppose also that + segment B.1 has been lost. The timestamp in TS.Recent is + 1 (from A.1), so C.1, ..., Z.1 are considered acceptable + and are queued. 
When B is retransmitted as segment B.2 + (using the latest timestamp), it fills the hole and causes + all the segments through Z to be acknowledged and passed + to the user. The timestamps of the queued segments are + *not* inspected again at this time, since they have + already been accepted. When B.2 is accepted, TS.Recent is + set to 2. + + This rule allows reasonable performance under loss. A full + + + +Jacobson, Braden, & Borman [Page 19] + +RFC 1323 TCP Extensions for High Performance May 1992 + + + window of data is in transit at all times, and after a loss a + full window less one packet will show up out-of-sequence to be + queued at the receiver (e.g., up to ~2**30 bytes of data); the + timestamp option must not result in discarding this data. + + In certain unlikely circumstances, the algorithm of rules R1-R4 + could lead to discarding some segments unnecessarily, as shown + in the following example: + + Suppose again that segments: A.1, B.1, C.1, ..., Z.1 have + been sent in sequence and that segment B.1 has been lost. + Furthermore, suppose delivery of some of C.1, ... Z.1 is + delayed until AFTER the retransmission B.2 arrives at the + receiver. These delayed segments will be discarded + unnecessarily when they do arrive, since their timestamps + are now out of date. + + This case is very unlikely to occur. If the retransmission was + triggered by a timeout, some of the segments C.1, ... Z.1 must + have been delayed longer than the RTO time. This is presumably + an unlikely event, or there would be many spurious timeouts and + retransmissions. If B's retransmission was triggered by the + "fast retransmit" algorithm, i.e., by duplicate ACKs, then the + queued segments that caused these ACKs must have been received + already. 
+ + Even if a segment were delayed past the RTO, the Fast + Retransmit mechanism [Jacobson90c] will cause the delayed + packets to be retransmitted at the same time as B.2, avoiding + an extra RTT and therefore causing a very small performance + penalty. + + We know of no case with a significant probability of occurrence + in which timestamps will cause performance degradation by + unnecessarily discarding segments. + + 4.2.2 Timestamp Clock + + It is important to understand that the PAWS algorithm does not + require clock synchronization between sender and receiver. The + sender's timestamp clock is used to stamp the segments, and the + sender uses the echoed timestamp to measure RTT's. However, + the receiver treats the timestamp as simply a monotone- + increasing serial number, without any necessary connection to + its clock. From the receiver's viewpoint, the timestamp is + acting as a logical extension of the high-order bits of the + sequence number. + + + + +Jacobson, Braden, & Borman [Page 20] + +RFC 1323 TCP Extensions for High Performance May 1992 + + + The receiver algorithm does place some requirements on the + frequency of the timestamp clock. + + (a) The timestamp clock must not be "too slow". + + It must tick at least once for each 2**31 bytes sent. In + fact, in order to be useful to the sender for round trip + timing, the clock should tick at least once per window's + worth of data, and even with the RFC-1072 window + extension, 2**31 bytes must be at least two windows. + + To make this more quantitative, any clock faster than 1 + tick/sec will reject old duplicate segments for link + speeds of ~8 Gbps. A 1ms timestamp clock will work at + link speeds up to 8 Tbps (8*10**12) bps! + + (b) The timestamp clock must not be "too fast". + + Its recycling time must be greater than MSL seconds. + Since the clock (timestamp) is 32 bits and the worst-case + MSL is 255 seconds, the maximum acceptable clock frequency + is one tick every 59 ns. 
+ + However, it is desirable to establish a much longer + recycle period, in order to handle outdated timestamps on + idle connections (see Section 4.2.3), and to relax the MSL + requirement for preventing sequence number wrap-around. + With a 1 ms timestamp clock, the 32-bit timestamp will + wrap its sign bit in 24.8 days. Thus, it will reject old + duplicates on the same connection if MSL is 24.8 days or + less. This appears to be a very safe figure; an MSL of + 24.8 days or longer can probably be assumed by the gateway + system without requiring precise MSL enforcement by the + TTL value in the IP layer. + + Based upon these considerations, we choose a timestamp clock + frequency in the range 1 ms to 1 sec per tick. This range also + matches the requirements of the RTTM mechanism, which does not + need much more resolution than the granularity of the + retransmit timer, e.g., tens or hundreds of milliseconds. + + The PAWS mechanism also puts a strong monotonicity requirement + on the sender's timestamp clock. The method of implementation + of the timestamp clock to meet this requirement depends upon + the system hardware and software. + + * Some hosts have a hardware clock that is guaranteed to be + monotonic between hardware resets. + + + +Jacobson, Braden, & Borman [Page 21] + +RFC 1323 TCP Extensions for High Performance May 1992 + + + * A clock interrupt may be used to simply increment a binary + integer by 1 periodically. + + * The timestamp clock may be derived from a system clock + that is subject to being abruptly changed, by adding a + variable offset value. This offset is initialized to + zero. When a new timestamp clock value is needed, the + offset can be adjusted as necessary to make the new value + equal to or larger than the previous value (which was + saved for this purpose). 
+ + + 4.2.3 Outdated Timestamps + + If a connection remains idle long enough for the timestamp + clock of the other TCP to wrap its sign bit, then the value + saved in TS.Recent will become too old; as a result, the PAWS + mechanism will cause all subsequent segments to be rejected, + freezing the connection (until the timestamp clock wraps its + sign bit again). + + With the chosen range of timestamp clock frequencies (1 sec to + 1 ms), the time to wrap the sign bit will be between 24.8 days + and 24800 days. A TCP connection that is idle for more than 24 + days and then comes to life is exceedingly unusual. However, + it is undesirable in principle to place any limitation on TCP + connection lifetimes. + + We therefore require that an implementation of PAWS include a + mechanism to "invalidate" the TS.Recent value when a connection + is idle for more than 24 days. (An alternative solution to the + problem of outdated timestamps would be to send keepalive + segments at a very low rate, but still more often than the + wrap-around time for timestamps, e.g., once a day. This would + impose negligible overhead. However, the TCP specification has + never included keepalives, so the solution based upon + invalidation was chosen.) + + Note that a TCP does not know the frequency, and therefore, the + wraparound time, of the other TCP, so it must assume the worst. + The validity of TS.Recent needs to be checked only if the basic + PAWS timestamp check fails, i.e., only if SEG.TSval < + TS.Recent. If TS.Recent is found to be invalid, then the + segment is accepted, regardless of the failure of the timestamp + check, and rule R3 updates TS.Recent with the TSval from the + new segment. + + To detect how long the connection has been idle, the TCP may + + + +Jacobson, Braden, & Borman [Page 22] + +RFC 1323 TCP Extensions for High Performance May 1992 + + + update a clock or timestamp value associated with the + connection whenever TS.Recent is updated, for example. 
The + details will be implementation-dependent. + + 4.2.4 Header Prediction + + "Header prediction" [Jacobson90a] is a high-performance + transport protocol implementation technique that is most + important for high-speed links. This technique optimizes the + code for the most common case, receiving a segment correctly + and in order. Using header prediction, the receiver asks the + question, "Is this segment the next in sequence?" This + question can be answered in fewer machine instructions than the + question, "Is this segment within the window?" + + Adding header prediction to our timestamp procedure leads to + the following recommended sequence for processing an arriving + TCP segment: + + H1) Check timestamp (same as step R1 above) + + H2) Do header prediction: if segment is next in sequence and + if there are no special conditions requiring additional + processing, accept the segment, record its timestamp, and + skip H3. + + H3) Process the segment normally, as specified in RFC-793. + This includes dropping segments that are outside the win- + dow and possibly sending acknowledgments, and queueing + in-window, out-of-sequence segments. + + Another possibility would be to interchange steps H1 and H2, + i.e., to perform the header prediction step H2 FIRST, and + perform H1 and H3 only when header prediction fails. This + could be a performance improvement, since the timestamp check + in step H1 is very unlikely to fail, and it requires interval + arithmetic on a finite field, a relatively expensive operation. + To perform this check on every single segment is contrary to + the philosophy of header prediction. We believe that this + change might reduce CPU time for TCP protocol processing by up + to 5-10% on high-speed networks. + + However, putting H2 first would create a hazard: a segment from + 2**32 bytes in the past might arrive at exactly the wrong time + and be accepted mistakenly by the header-prediction step. 
The + following reasoning has been introduced [Jacobson90b] to show + that the probability of this failure is negligible. + + + + +Jacobson, Braden, & Borman [Page 23] + +RFC 1323 TCP Extensions for High Performance May 1992 + + + If all segments are equally likely to show up as old + duplicates, then the probability of an old duplicate + exactly matching the left window edge is the maximum + segment size (MSS) divided by the size of the sequence + space. This ratio must be less than 2**-16, since MSS + must be < 2**16; for example, it will be (2**12)/(2**32) = + 2**-20 for an FDDI link. However, the older a segment is, + the less likely it is to be retained in the Internet, and + under any reasonable model of segment lifetime the + probability of an old duplicate exactly at the left window + edge must be much smaller than 2**-16. + + The 16 bit TCP checksum also allows a basic unreliability + of one part in 2**16. A protocol mechanism whose + reliability exceeds the reliability of the TCP checksum + should be considered "good enough", i.e., it won't + contribute significantly to the overall error rate. We + therefore believe we can ignore the problem of an old + duplicate being accepted by doing header prediction before + checking the timestamp. + + However, this probabilistic argument is not universally + accepted, and the consensus at present is that the performance + gain does not justify the hazard in the general case. It is + therefore recommended that H2 follow H1. + + 4.3. Duplicates from Earlier Incarnations of Connection + + The PAWS mechanism protects against errors due to sequence number + wrap-around on high-speed connections. Segments from an earlier + incarnation of the same connection are also a potential cause of + old duplicate errors. In both cases, the TCP mechanisms to + prevent such errors depend upon the enforcement of a maximum + segment lifetime (MSL) by the Internet (IP) layer (see Appendix of + RFC-1185 for a detailed discussion). 
Unlike the case of sequence + space wrap-around, the MSL required to prevent old duplicate + errors from earlier incarnations does not depend upon the transfer + rate. If the IP layer enforces the recommended 2 minute MSL of + TCP, and if the TCP rules are followed, TCP connections will be + safe from earlier incarnations, no matter how high the network + speed. Thus, the PAWS mechanism is not required for this case. + + We may still ask whether the PAWS mechanism can provide additional + security against old duplicates from earlier connections, allowing + us to relax the enforcement of MSL by the IP layer. Appendix B + explores this question, showing that further assumptions and/or + mechanisms are required, beyond those of PAWS. This is not part + of the current extension. + + + +Jacobson, Braden, & Borman [Page 24] + +RFC 1323 TCP Extensions for High Performance May 1992 + + +5. CONCLUSIONS AND ACKNOWLEDGMENTS + + This memo presented a set of extensions to TCP to provide efficient + operation over large-bandwidth*delay-product paths and reliable + operation over very high-speed paths. These extensions are designed + to provide compatible interworking with TCP's that do not implement + the extensions. + + These mechanisms are implemented using new TCP options for scaled + windows and timestamps. The timestamps are used for two distinct + mechanisms: RTTM (Round Trip Time Measurement) and PAWS (Protect + Against Wrapped Sequences). + + The Window Scale option was originally suggested by Mike St. Johns of + USAF/DCA. The present form of the option was suggested by Mike + Karels of UC Berkeley in response to a more cumbersome scheme defined + by Van Jacobson. Lixia Zhang helped formulate the PAWS mechanism + description in RFC-1185. + + Finally, much of this work originated as the result of discussions + within the End-to-End Task Force on the theoretical limitations of + transport protocols in general and TCP in particular. 
More recently, + task force members and others on the end2end-interest list have made + valuable contributions by pointing out flaws in the algorithms and + the documentation.
+ + [McKenzie89] McKenzie, A., "A Problem with the TCP Big Window + Option", RFC 1110, BBN STC, August 1989. + + [Nagle84] Nagle, J., "Congestion Control in IP/TCP + Internetworks", RFC 896, FACC, January 1984. + + [NBS85] Colella, R., Aronoff, R., and K. Mills, "Performance + Improvements for ISO Transport", Ninth Data Comm Symposium, + published in ACM SIGCOMM Comp Comm Review, vol. 15, no. 5, + September 1985. + + [Postel81] Postel, J., "Transmission Control Protocol - DARPA + Internet Program Protocol Specification", RFC 793, DARPA, + September 1981. + + [Velten84] Velten, D., Hinden, R., and J. Sax, "Reliable Data + Protocol", RFC 908, BBN, July 1984. + + [Watson81] Watson, R., "Timer-based Mechanisms in Reliable + Transport Protocol Connection Management", Computer Networks, Vol. + 5, 1981. + + [Zhang86] Zhang, L., "Why TCP Timers Don't Work Well", Proc. + SIGCOMM '86, Stowe, Vt., August 1986. + + + + + + + + + +Jacobson, Braden, & Borman [Page 26] + +RFC 1323 TCP Extensions for High Performance May 1992 + + +APPENDIX A: IMPLEMENTATION SUGGESTIONS + + The following layouts are recommended for sending options on non-SYN + segments, to achieve maximum feasible alignment of 32-bit and 64-bit + machines. + + + +--------+--------+--------+--------+ + | NOP | NOP | TSopt | 10 | + +--------+--------+--------+--------+ + | TSval timestamp | + +--------+--------+--------+--------+ + | TSecr timestamp | + +--------+--------+--------+--------+ + + +APPENDIX B: DUPLICATES FROM EARLIER CONNECTION INCARNATIONS + + There are two cases to be considered: (1) a system crashing (and + losing connection state) and restarting, and (2) the same connection + being closed and reopened without a loss of host state. These will + be described in the following two sections. + + B.1 System Crash with Loss of State + + TCP's quiet time of one MSL upon system startup handles the loss + of connection state in a system crash/restart. 
For an + explanation, see for example "When to Keep Quiet" in the TCP + protocol specification [Postel81]. The MSL that is required here + does not depend upon the transfer speed. The current TCP MSL of 2 + minutes seems acceptable as an operational compromise, as many + host systems take this long to boot after a crash. + + However, the timestamp option may be used to ease the MSL + requirements (or to provide additional security against data + corruption). If timestamps are being used and if the timestamp + clock can be guaranteed to be monotonic over a system + crash/restart, i.e., if the first value of the sender's timestamp + clock after a crash/restart can be guaranteed to be greater than + the last value before the restart, then a quiet time will be + unnecessary. + + To dispense totally with the quiet time would require that the + host clock be synchronized to a time source that is stable over + the crash/restart period, with an accuracy of one timestamp clock + tick or better. We can back off from this strict requirement to + take advantage of approximate clock synchronization. Suppose that + the clock is always re-synchronized to within N timestamp clock + + + +Jacobson, Braden, & Borman [Page 27] + +RFC 1323 TCP Extensions for High Performance May 1992 + + + ticks and that booting (extended with a quiet time, if necessary) + takes more than N ticks. This will guarantee monotonicity of the + timestamps, which can then be used to reject old duplicates even + without an enforced MSL. + + B.2 Closing and Reopening a Connection + + When a TCP connection is closed, a delay of 2*MSL in TIME-WAIT + state ties up the socket pair for 4 minutes (see Section 3.5 of + [Postel81]. Applications built upon TCP that close one connection + and open a new one (e.g., an FTP data transfer connection using + Stream mode) must choose a new socket pair each time. 
The TIME- + WAIT delay serves two different purposes: + + (a) Implement the full-duplex reliable close handshake of TCP. + + The proper time to delay the final close step is not really + related to the MSL; it depends instead upon the RTO for the + FIN segments and therefore upon the RTT of the path. (It + could be argued that the side that is sending a FIN knows + what degree of reliability it needs, and therefore it should + be able to determine the length of the TIME-WAIT delay for + the FIN's recipient. This could be accomplished with an + appropriate TCP option in FIN segments.) + + Although there is no formal upper-bound on RTT, common + network engineering practice makes an RTT greater than 1 + minute very unlikely. Thus, the 4 minute delay in TIME-WAIT + state works satisfactorily to provide a reliable full-duplex + TCP close. Note again that this is independent of MSL + enforcement and network speed. + + The TIME-WAIT state could cause an indirect performance + problem if an application needed to repeatedly close one + connection and open another at a very high frequency, since + the number of available TCP ports on a host is less than + 2**16. However, high network speeds are not the major + contributor to this problem; the RTT is the limiting factor + in how quickly connections can be opened and closed. + Therefore, this problem will be no worse at high transfer + speeds. + + (b) Allow old duplicate segments to expire. + + To replace this function of TIME-WAIT state, a mechanism + would have to operate across connections. PAWS is defined + strictly within a single connection; the last timestamp is + TS.Recent is kept in the connection control block, and + + + +Jacobson, Braden, & Borman [Page 28] + +RFC 1323 TCP Extensions for High Performance May 1992 + + + discarded when a connection is closed. + + An additional mechanism could be added to the TCP, a per-host + cache of the last timestamp received from any connection. 
+ This value could then be used in the PAWS mechanism to reject + old duplicate segments from earlier incarnations of the + connection, if the timestamp clock can be guaranteed to have + ticked at least once since the old connection was open. This + would require that the TIME-WAIT delay plus the RTT together + must be at least one tick of the sender's timestamp clock. + Such an extension is not part of the proposal of this RFC. + + Note that this is a variant on the mechanism proposed by + Garlick, Rom, and Postel [Garlick77], which required each + host to maintain connection records containing the highest + sequence numbers on every connection. Using timestamps + instead, it is only necessary to keep one quantity per remote + host, regardless of the number of simultaneous connections to + that host. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Jacobson, Braden, & Borman [Page 29] + +RFC 1323 TCP Extensions for High Performance May 1992 + + +APPENDIX C: CHANGES FROM RFC-1072, RFC-1185 + + The protocol extensions defined in this document differ in several + important ways from those defined in RFC-1072 and RFC-1185. + + (a) SACK has been deferred to a later memo. + + (b) The detailed rules for sending timestamp replies (see Section + 3.4) differ in important ways. The earlier rules could result + in an under-estimate of the RTT in certain cases (packets + dropped or out of order). + + (c) The same value TS.Recent is now shared by the two distinct + mechanisms RTTM and PAWS. This simplification became possible + because of change (b). + + (d) An ambiguity in RFC-1185 was resolved in favor of putting + timestamps on ACK as well as data segments. This supports the + symmetry of the underlying TCP protocol. + + (e) The echo and echo reply options of RFC-1072 were combined into a + single Timestamps option, to reflect the symmetry and to + simplify processing. 
+ + (f) The problem of outdated timestamps on long-idle connections, + discussed in Section 4.2.2, was realized and resolved. + + (g) RFC-1185 recommended that header prediction take precedence over + the timestamp check. Based upon some scepticism about the + probabilistic arguments given in Section 4.2.4, it was decided + to recommend that the timestamp check be performed first. + + (h) The spec was modified so that the extended options will be sent + on segments only when they are received in the + corresponding segments. This provides the most + conservative possible conditions for interoperation with + implementations without the extensions. + + In addition to these substantive changes, the present RFC attempts to + specify the algorithms unambiguously by presenting modifications to + the Event Processing rules of RFC-793; see Appendix E. + + + + + + + + + + +Jacobson, Braden, & Borman [Page 30] + +RFC 1323 TCP Extensions for High Performance May 1992 + + +APPENDIX D: SUMMARY OF NOTATION + + The following notation has been used in this document. + + Options + + WSopt: TCP Window Scale Option + TSopt: TCP Timestamps Option + + Option Fields + + shift.cnt: Window scale byte in WSopt. + TSval: 32-bit Timestamp Value field in TSopt. + TSecr: 32-bit Timestamp Reply field in TSopt. + + Option Fields in Current Segment + + SEG.TSval: TSval field from TSopt in current segment. + SEG.TSecr: TSecr field from TSopt in current segment. + SEG.WSopt: 8-bit value in WSopt + + Clock Values + + my.TSclock: Local source of 32-bit timestamp values + my.TSclock.rate: Period of my.TSclock (1 ms to 1 sec). 
+ + Per-Connection State Variables + + TS.Recent: Latest received Timestamp + Last.ACK.sent: Last ACK field sent + + Snd.TS.OK: 1-bit flag + Snd.WS.OK: 1-bit flag + + Rcv.Wind.Scale: Receive window scale power + Snd.Wind.Scale: Send window scale power + + + + + + + + + + + + + + + +Jacobson, Braden, & Borman [Page 31] + +RFC 1323 TCP Extensions for High Performance May 1992 + + +APPENDIX E: EVENT PROCESSING + + +Event Processing + + OPEN Call + + ... + An initial send sequence number (ISS) is selected. Send a SYN + segment of the form: + + + + ... + + SEND Call + + CLOSED STATE (i.e., TCB does not exist) + + ... + + LISTEN STATE + + If the foreign socket is specified, then change the connection + from passive to active, select an ISS. Send a SYN segment + containing the options: and + . Set SND.UNA to ISS, SND.NXT to ISS+1. + Enter SYN-SENT state. ... + + SYN-SENT STATE + SYN-RECEIVED STATE + + ... + + ESTABLISHED STATE + CLOSE-WAIT STATE + + Segmentize the buffer and send it with a piggybacked + acknowledgment (acknowledgment value = RCV.NXT). ... + + If the urgent flag is set ... + + If the Snd.TS.OK flag is set, then include the TCP Timestamps + option in each data segment. + + Scale the receive window for transmission in the segment header: + + SEG.WND = (SND.WND >> Rcv.Wind.Scale). + + + +Jacobson, Braden, & Borman [Page 32] + +RFC 1323 TCP Extensions for High Performance May 1992 + + + SEGMENT ARRIVES + + ... + + If the state is LISTEN then + + first check for an RST + + ... + + second check for an ACK + + ... + + third check for a SYN + + if the SYN bit is set, check the security. If the ... + + ... + + If the SEG.PRC is less than the TCB.PRC then continue. + + Check for a Window Scale option (WSopt); if one is found, save + SEG.WSopt in Snd.Wind.Scale and set Snd.WS.OK flag on. + Otherwise, set both Snd.Wind.Scale and Rcv.Wind.Scale to zero + and clear Snd.WS.OK flag. 
+ + Check for a TSopt option; if one is found, save SEG.TSval in the + variable TS.Recent and turn on the Snd.TS.OK bit. + + Set RCV.NXT to SEG.SEQ+1, IRS is set to SEG.SEQ and any other + control or text should be queued for processing later. ISS + should be selected and a SYN segment sent of the form: + + + + If the Snd.WS.OK bit is on, include a WSopt option + in this segment. If the Snd.TS.OK bit is + on, include a TSopt in this + segment. Last.ACK.sent is set to RCV.NXT. + + SND.NXT is set to ISS+1 and SND.UNA to ISS. The connection + state should be changed to SYN-RECEIVED. Note that any other + incoming control or data (combined with SYN) will be processed + in the SYN-RECEIVED state, but processing of SYN and ACK should + not be repeated. If the listen was not fully specified (i.e., + the foreign socket was not fully specified), then the + unspecified fields should be filled in now. + + + +Jacobson, Braden, & Borman [Page 33] + +RFC 1323 TCP Extensions for High Performance May 1992 + + + fourth other text or control + + ... + + If the state is SYN-SENT then + + first check the ACK bit + + ... + + fourth check the SYN bit + + ... + + If the SYN bit is on and the security/compartment and precedence + are acceptable then, RCV.NXT is set to SEG.SEQ+1, IRS is set to + SEG.SEQ, and any acknowledgements on the retransmission queue + which are thereby acknowledged should be removed. + + Check for a Window Scale option (WSopt); if is found, save + SEG.WSopt in Snd.Wind.Scale; otherwise, set both Snd.Wind.Scale + and Rcv.Wind.Scale to zero. + + Check for a TSopt option; if one is found, save SEG.TSval in + variable TS.Recent and turn on the Snd.TS.OK bit in the + connection control block. If the ACK bit is set, use my.TSclock + - SEG.TSecr as the initial RTT estimate. + + If SND.UNA > ISS (our SYN has been ACKed), change the connection + state to ESTABLISHED, form an ACK segment: + + + + and send it. 
If the Snd.Echo.OK bit is on, include a TSopt + option <TSval=my.TSclock,TSecr=TS.Recent> in this ACK segment. + Last.ACK.sent is set to RCV.NXT. + + Data or controls which were queued for transmission may be + included. If there are other controls or text in the segment + then continue processing at the sixth step below where the URG + bit is checked, otherwise return. + + Otherwise enter SYN-RECEIVED, form a SYN,ACK segment: + + <SEQ=ISS><ACK=RCV.NXT><CTL=SYN,ACK> + + and send it. If the Snd.Echo.OK bit is on, include a TSopt + option <TSval=my.TSclock,TSecr=TS.Recent> in this segment.
+ + + + +Jacobson, Braden, & Borman [Page 35] + +RFC 1323 TCP Extensions for High Performance May 1992 + + + There are four cases for the acceptability test for an incoming + segment: + + ... + + If an incoming segment is not acceptable, an acknowledgment + should be sent in reply (unless the RST bit is set, if so drop + the segment and return): + + + + Last.ACK.sent is set to SEG.ACK of the acknowledgment. If the + Snd.Echo.OK bit is on, include the Timestamps option + in this ACK segment. Set + Last.ACK.sent to SEG.ACK and send the ACK segment. After + sending the acknowledgment, drop the unacceptable segment and + return. + + ... + + fifth check the ACK field. + + if the ACK bit is off drop the segment and return. + + if the ACK bit is on + + ... + + ESTABLISHED STATE + + If SND.UNA < SEG.ACK =< SND.NXT then, set SND.UNA <- SEG.ACK. + Also compute a new estimate of round-trip time. If Snd.TS.OK + bit is on, use my.TSclock - SEG.TSecr; otherwise use the + elapsed time since the first segment in the retransmission + queue was sent. Any segments on the retransmission queue + which are thereby entirely acknowledged... + + ... + + Seventh, process the segment text. + + ESTABLISHED STATE + FIN-WAIT-1 STATE + FIN-WAIT-2 STATE + + ... + + Send an acknowledgment of the form: + + + +Jacobson, Braden, & Borman [Page 36] + +RFC 1323 TCP Extensions for High Performance May 1992 + + + + + If the Snd.TS.OK bit is on, include Timestamps option + in this ACK segment. Set + Last.ACK.sent to SEG.ACK of the acknowledgment, and send it. + This acknowledgment should be piggy-backed on a segment being + transmitted if possible without incurring undue delay. + + + ... + + +Security Considerations + + Security issues are not discussed in this memo. 
+ +Authors' Addresses + + Van Jacobson + University of California + Lawrence Berkeley Laboratory + Mail Stop 46A + Berkeley, CA 94720 + + Phone: (415) 486-6411 + EMail: van@CSAM.LBL.GOV + + + Bob Braden + University of Southern California + Information Sciences Institute + 4676 Admiralty Way + Marina del Rey, CA 90292 + + Phone: (310) 822-1511 + EMail: Braden@ISI.EDU + + + Dave Borman + Cray Research + 655-E Lone Oak Drive + Eagan, MN 55121 + + Phone: (612) 683-5571 + Email: dab@cray.com + + + + + + +Jacobson, Braden, & Borman [Page 37] + \ No newline at end of file diff --git a/ext/picotcp/RFC/rfc1379.txt b/ext/picotcp/RFC/rfc1379.txt new file mode 100644 index 0000000..b5f2bdc --- /dev/null +++ b/ext/picotcp/RFC/rfc1379.txt @@ -0,0 +1,2131 @@ + + + + + + +Network Working Group R. Braden +Request for Comments: 1379 ISI + November 1992 + + + Extending TCP for Transactions -- Concepts + +Status of This Memo + + This memo provides information for the Internet community. It does + not specify an Internet standard. Distribution of this memo is + unlimited. + +Abstract + + This memo discusses extension of TCP to provide transaction-oriented + service, without altering its virtual-circuit operation. This + extension would fill the large gap between connection-oriented TCP + and datagram-based UDP, allowing TCP to efficiently perform many + applications for which UDP is currently used. A separate memo + contains a detailed functional specification for this proposed + extension. + + This work was supported in part by the National Science Foundation + under Grant Number NCR-8922231. + +TABLE OF CONTENTS + + 1. INTRODUCTION .................................................. 2 + 2. TRANSACTIONS USING STANDARD TCP ............................... 3 + 3. BYPASSING THE 3-WAY HANDSHAKE ................................. 6 + 3.1 Concept of TAO ........................................... 6 + 3.2 Cache Initialization ..................................... 
3.3 Accepting <SYN,ACK> Segments ............................. 11
Currently, a + transaction-oriented Internet application must choose to suffer the + overhead of opening and closing TCP connections or else build an + application-specific transport mechanism on top of the connectionless + transport protocol UDP. Greater convenience, uniformity, and + efficiency would result from widely-available kernel implementations + of a transport protocol supporting a transaction service model [RFC- + 955]. + + The transaction service model has the following features: + + * The fundamental interaction is a request followed by a response. + + * An explicit open or close phase would impose excessive overhead. + + * At-most-once semantics is required; that is, a transaction must + not be "replayed" by a duplicate request packet. + + * In favorable circumstances, a reliable request/response + handshake can be performed with exactly one packet in each + direction. + + * The minimum transaction latency for a client is RTT + SPT, where + RTT is the round-trip time and SPT is the server processing + time. + + We use the term "transaction transport protocol" for a transport- + layer protocol that follows this model [RFC-955]. + + The Internet architecture allows an arbitrary collection of transport + protocols to be defined on top of the minimal end-to-end datagram + service provided by IP [Clark88]. In practice, however, production + systems implement only TCP and UDP at the transport layer. It has + proven difficult to leverage a new transport protocol into place, to + be widely enough available to be useful for application builders. + + + +Braden [Page 2] + +RFC 1379 Transaction TCP -- Concepts November 1992 + + + This memo explores an alternative approach to providing a transaction + transport protocol: extending TCP to implement the transaction + service model, while continuing to support the virtual circuit model. + Each transaction will then be a single instance of a TCP connection. 
+ The proposed transaction extension is effectively implementable + within current TCPs and operating systems, and it should also scale + to the much faster networks, interfaces, and CPUs of the future. + + The present memo explains the theory behind the extension, in + somewhat exquisite detail. Despite the length and complexity of this + memo, the TCP extensions required for transactions are in fact quite + limited and simple. Another memo [TTCP-FS] provides a self-contained + functional specification of the extensions. + + Section 2 of this memo describes the limitations of standard TCP for + transaction processing, to motivate the extensions. Sections 3, 4, + and 5 explore the fundamental extensions that are required for + transactions. Section 6 discusses the changes required in the TCP + connection state diagram. Finally, Section 7 presents conclusions + and acknowledgments. Familiarity with the standard TCP protocol + [STD-007] is assumed. + +2. TRANSACTIONS USING STANDARD TCP + + Reliable transfer of data depends upon sequence numbers. Before data + transfer can begin, both parties must "synchronize" the connection, + i.e, agree on common sequence numbers. The synchronization procedure + must preserve at-most-once semantics, i.e., be free from replay + hazards due to duplicate packets. The TCP developers adopted a + synchronization mechanism known as the 3-way handshake. + + Consider a simple transaction in which client host A sends a single- + segment request to server host B, and B returns a single-segment + response. Many current TCP implementations use at least ten segments + (i.e., packets) for this sequence: three for the 3-way handshake + opening the connection, four to send and acknowledge the request and + response data, and three for TCP's full-duplex data-conserving close + sequence. These ten segments represent a high relative overhead for + two data-bearing segments. 
However, a more important consideration + is the transaction latency seen by the client: 2*RTT + SPT, larger + than the minimum by one RTT. As CPU and network speeds increase, the + relative significance of this extra transaction latency also + increases. + + Proposed transaction transport protocols have typically used a + "timer-based" approach to connection synchronization [Birrell84]. In + this approach, once end-to-end connection state is established in the + client and server hosts, a subset of this state is maintained for + + + +Braden [Page 3] + +RFC 1379 Transaction TCP -- Concepts November 1992 + + + some period of time. A new request before the expiration of this + timeout period can then reestablish the full state without an + explicit handshake. Watson pointed out that the timer-based approach + of his Delta-T protocol [Watson81] would encompass both virtual + circuits and transactions. However, the TCP group adopted the 3-way + handshake (because of uncertainty about the robustness of enforcing + the packet lifetime bounds required by Delta-T, within a general + Internet environment). More recently, Liskov, Shrira, and Wroclawski + [Liskov90] have proposed a different timer-based approach to + connection synchronization, requiring loosely-synchronized clocks in + the hosts. + + The technique proposed in this memo, suggested by Clark [Clark89], + depends upon cacheing of connection state but not upon clocks or + timers; it is described in Section 3 below. Garlick, Rom, and Postel + also proposed a connection synchronization mechanism using cached + state [Garlick77]. Their scheme required each host to maintain + connection records containing the highest sequence number on each + connection. The technique suggested here retains only per-host + state, not per-connection state. + + During TCP development, it was suggested that TCP could support + transactions with data segments containing both SYN and FIN bits. 
+ (These "Kamikaze" segments were not supported as a service; they were + used mainly to crash other experimental TCPs!) To illustrate this + idea, Figure 1 shows a plausible application of the current TCP rules + to create a minimal transaction. (In fact, some minor adjustments in + the standard TCP spec would be required to make Figure 1 fully legal + [STD-007]). + + Figure 1, like many of the examples shown in this memo, uses an + abbreviated form to illustrate segment sequences. For clarity and + brevity, it omits explicit sequence and acknowledgment numbers, + assuming that these will follow the well-known TCP rules. The + notation "ACK(x)" implies a cumulative acknowledgment for the control + bit or data "x" and everything preceding "x" in the sequence space. + The referent of "x" should be clear from the context. Also, host A + will always be the client and host B will be the server in these + diagrams. + + The first three segments in Figure 1 implement the standard TCP + three-way handshake. If segment #1 had been an old duplicate, the + client side would have sent an RST (Reset) bit in segment #3, + terminating the sequence. The request data included on the initial + SYN segment cannot be delivered to user B until segment #3 completes + the 3-way handshake. Loading control bits onto the segments has + reduced the total number of segments to 5, but the client still + observes a transaction latency of 2*RTT + SPT. The 3-way handshake + + + +Braden [Page 4] + +RFC 1379 Transaction TCP -- Concepts November 1992 + + + thus precludes high-performance transaction processing. + + + TCP A (Client) TCP B (Server) + _______________ ______________ + + CLOSED LISTEN + + (Client sends request) + 1. SYN-SENT --> --> SYN-RCVD + (data1 queued) + + 2. ESTABLISHED <-- <-- SYN-RCVD + + + 3. FIN-WAIT-1 --> --> CLOSE-WAIT + (data1 to server) + + (Server sends reply) + 4. TIME-WAIT <-- <-- LAST-ACK + (data2 to client) + + 5. 
TIME-WAIT --> <ACK(FIN)> --> CLOSED
+ These include: + + (1) augmenting the user interface for transactions, + + (2) delaying acknowledgment segments to allow maximum piggy-backing + of control bits with data, + + (3) measuring the retransmission timeout time (RTO) on very short + connections, and + + (4) providing an initial server window. + + A recently proposed set of enhancements [RFC-1323] defines a TCP + Timestamps option that carries two 32-bit timestamp values. The + Timestamps option is used to accurately measure round-trip time + (RTT). The same option is also used in a procedure known as "PAWS" + (Protect Against Wrapped Sequence numbers) to prevent erroneous data + delivery due to a combination of old duplicate segments and sequence + number reuse at very high bandwidths. The particular approach to + transactions chosen in this memo does not require the RFC-1323 + enhancements; however, they are important and should be implemented + in every TCP, with or without the transaction extensions described + here. + +3. BYPASSING THE 3-WAY HANDSHAKE + + To avoid 3-way handshakes for transactions, we introduce a new + mechanism for validating initial SYN segments, i.e., for enforcing + at-most-once semantics without a 3-way handshake. We refer to this + as the TCP Accelerated Open, or TAO, mechanism. + + 3.1 Concept of TAO + + The basis of TAO is this: a TCP uses cached per-host information + to immediately validate new SYNs [Clark89]. If this validation + fails, e.g., because there is no current cached state or the + segment is an old duplicate, the procedure falls back to a normal + 3-way handshake to validate the SYN. Thus, bypassing a 3-way + handshake is considered to be an optional optimization. + + + + +Braden [Page 6] + +RFC 1379 Transaction TCP -- Concepts November 1992 + + + The proposed TAO mechanism uses a finite sequence-like space of + values that increase monotonically with successive transactions + (connections) between a given (client, server) host pair. 
Call + this monotonic space M, and let each initial SYN segment carry an + M value SEG.M. If M is not the existing sequence (SEG.SEQ) field, + SEG.M may be carried in a TCP option. + + When host B receives from host A an initial SYN segment containing + a new value SEG.M, host B compares this against cache.M[A], the + latest M value that B has cached for host A. This comparison is + the "TAO test". Because the M values are monotonically + increasing, SEG.M > cache.M[A] implies that the SYN must be new + and can be accepted immediately. If not, a normal 3-way handshake + is performed to validate the initial SYN segment. Figure 2 + illustrates the TAO mechanism; cached M values are shown enclosed + in square brackets. The M values generated by host A satisfy + x0 < x1, and the M values generated by host B satisfy y0 < y1. + + An appropriate choice for the M value space is discussed in + Section 5. M values are drawn from a finite number space, so + inequalities must be defined in the usual way for sequence numbers + [STD-007]. The M space must not wrap so quickly that an old + duplicate SYN will be erroneously accepted. We assume that some + maximum segment lifetime (MSL) is enforced by the IP layer. + + ____T_C_P__A_____ ____T_C_P__B_____ + + cache.M[B] cache.M[A] + V V + + [ y0 ] [ x0 ] + + 1. --> --> ( (x1 > x0) => + data1 -> user_B; + cache.M[A]= x1) + + [ y0 ] [ x1 ] + 2. <-- <-- + + (data2 -> user_A, + cache.M[B]= y1) + + [ y1 ] [ x1 ] + ... (etc.) ... + + + Figure 2. TAO: Three-Way Handshake is Bypassed + + + + +Braden [Page 7] + +RFC 1379 Transaction TCP -- Concepts November 1992 + + + Figure 2 shows the simplest case: each side has cached the latest + M value of the other, and the SEG.M value in the client's SYN + segment is greater than the value in the cache at the server host. + As a result, B can accept the client A's request data1 immediately + and pass it to the server application. B's reply data2 is shown + piggybacked on the segment. 
As a result of this 2-way + exchange, the cached M values are updated at both sites; the + client side becomes relevant only if the client/server roles + reverse. Validation of the segment at host A is + discussed later. + + Figure 3 shows the TAO test failing but the consequent 3-way + handshake succeeding. B updates its cache with the value x2 >= x1 + when the initial SYN is known to be valid. + + + _T_C_P__A _T_C_P__B + + cache.M[B] cache.M[A] + V V + + [ y0 ] [ x0 ] + 1. --> --> ( (x1 <= x0) => + data1 queued; + 3-way handshake) + + [ y0 ] [ x0 ] + 2. <-- <-- + (cache.M[B]= y1) + + [ y1 ] [ x0 ] + 3. --> --> (Handshake OK => + data1->user_B, + cache.M[A]= x2) + + [ y1 ] [ x2 ] + ... (etc.) ... + + Figure 3. TAO Test Fails but 3-Way Handshake Succeeds. + + There are several possible causes for a TAO test failure on a + legitimate new SYN segment (not an old duplicate). + + (1) There may be no cached M value for this particular client + host. + + (2) The SYN may be the one of a set of nearly-simultaneous SYNs + for different connections but from the same host, which + + + +Braden [Page 8] + +RFC 1379 Transaction TCP -- Concepts November 1992 + + + arrived out of order. + + (3) The finite M space may have wrapped around between successive + transactions from the same client. + + (4) The M values may advance too slowly for closely-spaced + transactions. + + None of these TAO failures will cause a lockout, because the + resulting 3-way handshake will succeed. Note that the first + transaction between a given host pair will always require a 3-way + handshake; subsequent transactions can take advantage of TAO. + + The per-host cache required by TAO is highly desirable for other + reasons, e.g., to retain the measured round trip time and MTU for + a given remote host. Furthermore, a host should already have a + per-host routing cache [HR-COMM] that should be easily extensible + for this purpose. 
+ + Figure 4 illustrates a complete TCP transaction sequence using the + TAO mechanism. Bypassing the 3-way handshake leads to new + connection states; Figure 4 shows three of them, "SYN-SENT*", + "CLOSE-WAIT*", and "LAST-ACK*". Explanation of these states is + deferred to Section 6. + + + TCP A (Client) TCP B (Server) + _______________ ______________ + + CLOSED LISTEN + + 1. SYN-SENT* --> --> CLOSE-WAIT* + (TAO test OK=> + data1->user_B) + + <-- <-- LAST-ACK* + 2. TIME-WAIT + (data2->user_A) + + + 3. TIME-WAIT --> --> CLOSED + + (timeout) + CLOSED + + + Figure 4: Minimal Transaction Sequence Using TAO + + + + +Braden [Page 9] + +RFC 1379 Transaction TCP -- Concepts November 1992 + + + 3.2 Cache Initialization + + The first connection between hosts A and B will find no cached + state at one or both ends, so both M caches must be initialized. + This requires that the first transaction carry a specially marked + SEG.M value, which we call SEG.M.NEW. Receiving a SEG.M.NEW value + in an initial SYN segment, B will cache this value and send its + own M back to initialize A's cache. When a host crashes and + restarts, all its cached M values cache.M[*] must be invalidated + in order to force a re-synchronization of the caches at both ends. + + This cache synchronization procedure is illustrated in Figure 5, + where client host A has crashed and restarted with its cache + entries undefined, as indicated by "??". Since cache.TS[B] is + undefined, A sends a SEG.M.NEW value instead of SEG.M in the + segment of its first transaction request to B. Receiving this + SEG.M.NEW, the server host B invalidates cache.TS[A] and performs + a 3-way handshake. SEG.M in segment #2 updates A's cache, and + when the handshake completes successfully, B updates its cached M + value to x2 >= x1. + + + _T_C_P__A _T_C_P__B + + cache.M[B] cache.M[A] + V V + [ ?? ] [ x0 ] + + 1. --> --> (invalidate cache; + queue data1; + [ ?? ] 3-way handshake) + + [ ?? ] + 2. 
<-- <-- + (cache.M[B]= y1) + + [ y1 ] [ ?? ] + + 3. --> --> data1->user_B, + cache.M[A]= x2) + + [ y1 ] [ x2 ] + ... (etc.) ... + + Figure 5. Client Host Crashed + + + Suppose that the 3-way handshake failed, presumably because + + + +Braden [Page 10] + +RFC 1379 Transaction TCP -- Concepts November 1992 + + + segment #1 was an old duplicate. Then segment #3 from host A + would be an RST segment, with the result that both side's caches + would be left undefined. + + Figure 6 shows the procedure when the server crashes and restarts. + Upon receiving a segment from a host for which it has no + cached M value, B initiates a 3-way handshake to validate the + request and sends its own M value to A. Again the result is to + update cached M values on both sides. + + + _T_C_P__A _T_C_P__B + + cache.M[B] cache.M[A] + V V + [ y0 ] [ ?? ] + + 1. --> --> (data1 queued; + 3-way handshake) + + [ y0 ] [ ?? ] + 2. <-- <-- + (cache.M[B]= y1) + + [ y1 ] [ ?? ] + 3. --> --> (data1->user_B, + cache.M[A]= x2) + + [ y1 ] [ x2 ] + ... (etc.) ... + + + Figure 6. Server Host Crashed + + + 3.3 Accepting Segments + + Transactions introduce a new hazard of erroneously accepting an + old duplicate segment. To be acceptable, a + segment must arrive in SYN-SENT state, and its ACK field must + acknowledge something that was sent. In current TCPs the + effective send window in SYN-SENT state is exactly one octet, and + an acceptable must exactly ACK this one octet. The + clock-driven selection of Initial Sequence Number (ISN) makes an + erroneous acceptance exceedingly unlikely. An old duplicate SYN + could be accepted erroneously only if successive connection + attempts occurred more often than once every 4 microseconds, or if + the segment lifetime exceeded the 4 hour wraparound time for ISN + + + +Braden [Page 11] + +RFC 1379 Transaction TCP -- Concepts November 1992 + + + selection. 
+ + However, when TCP is used for transactions, data sent with the + initial SYN increases the range of sequence numbers that have been + sent. This increases the danger of accepting an old duplicate + segment, and the consequences are more serious. In the + example in Figure 7, segments 1-3 form a normal transaction + sequence, and segment 4 begins a new transaction (incarnation) for + the same connection. Segment #5 is a duplicate of segment #2 from + the preceding transaction. Although the new transaction has a + larger ISN, the previous ACK value 402 falls into the new range + [200,700) of sequence numbers that have been sent, so segment #5 + could be erroneously accepted and passed to the client as the + response to the new request. + + _T_C_P__A _T_C_P__B + + CLOSED LISTEN + + 1. --> --> (TAO test OK) + + + 2. <-- <-- + + + 3. TIME-WAIT --> --> CLOSED + (short timeout) + CLOSED + + (New Request) + 4. --> --> ... + + (Duplicate of segment #2) + 5. <-- <--... + (Acceptable!!) + + + Figure 7: Old Duplicate Causing Error + + + Unfortunately, we cannot simply use TAO on the client side to + detect and reject old duplicate segments. A TAO test at + the client might fail for a valid segment, due to out- + of-order delivery, and this could result in permanent non-delivery + of a valid transaction reply. + + Instead, we include a second M value, an echo of the client's M + value from the initial segment, in the segment. A + + + +Braden [Page 12] + +RFC 1379 Transaction TCP -- Concepts November 1992 + + + specially-marked M value, SEG.M.ECHO, is used for this purpose. + The client knows the value it sent in the initial and can + therefore positively validate the using the echoed + value. This is illustrated in Figure 12, which is the same as + Figure 4 with the addition of the echoed value on the + segment #2. + + It should be noted that TCP allows a simultaneous open sequence in + which both sides send and receive an initial (see Figure 8 + of [STD-007]. 
In this case, the TAO test must be performed on + both sides to preserve the symmetry. See [TTCP-FS] for an + example. + +4. SHORTENING TIME-WAIT STATE + + Once a transaction has been initiated for a particular connection + (pair of ports) between a given host pair, a new transaction for the + same connection cannot take place for a time that is at least: + + RTT + SPT + TIME-WAIT_delay + + Since the client host can cycle among the 64512 available port + numbers, an upper bound on the transaction rate between a particular + host pair is: + + [1] TRmax = 64512 /(RTT + TIME-WAIT_Delay) + + in transactions per second (Tps), where we assumed SPT is negligible. + We must reduce TIME-WAIT_Delay to support high-rate TCP transaction + processing. + + TIME-WAIT state performs two functions: (1) supporting the full- + duplex reliable close of TCP, and (2) allowing old duplicate segments + from an earlier connection incarnation to expire before they can + cause an error (see Appendix to [RFC-1185]). The first function + impacts the application model of a TCP connection, which we would not + want to change. The second is part of the fundamental machinery of + TCP reliable delivery; to safely truncate TIME-WAIT state, we must + provide another means to exclude duplicate packets from earlier + incarnations of the connection. + + To minimize the delay in TIME-WAIT state while performing both + functions, we propose to set the TIME-WAIT delay to: + + [2] TIME-WAIT_Delay = max( K*RTO, U ) + + where U and K are constants and RTO is the dynamically-determined + retransmission timeout, the measured RTT plus an allowance for the + + + +Braden [Page 13] + +RFC 1379 Transaction TCP -- Concepts November 1992 + + + RTT variance [Jacobson88]. We choose K large enough so that there is + high probability of the close completing successfully if at all + possible; K = 8 seems reasonable. This takes care of the first + function of TIME-WAIT state. 
+ + In a real implementation, there may be a minimum RTO value Tr, + corresponding to the precision of RTO calculation. For example, in + the popular BSD implementation of TCP, the minimum RTO is Tr = 0.5 + second. Assuming K = 8 and U = 0, Eqns [1] and [2] impose an upper + limit of TRmax = 16K Tps on the transaction rate of these + implementations. + + It is possible to have many short connections only if RTO is very + small, in which case the TIME-WAIT delay [2] reduces to U. To + accelerate the close sequence, we need to reduce U below the MSL + enforced by the IP layer, without introducing a hazard from old + duplicate segments. For this purpose, we introduce another monotonic + number sequence; call it X. X values are required to be monotonic + between successive connection incarnations; depending upon the choice + of the X space (see Section 5), X values may also increase during a + connection. A value from the X space is to be carried in every + segment, and a segment is rejected if it is received with an X value + smaller than the largest X value received. This mechanism does not + use a cache; the largest X value is maintained in the TCP connection + control block (TCB) for each connection. + + The value of U depends upon the choice for the X space, discussed in + the next section. If X is time-like, U can be set to twice the time + granularity (i.e, twice the minimum "tick" time) of X. The TIME-WAIT + delay will then ensure that current X values do not overlap the X + values of earlier incarnations of the same connection. Another + consequence of time-like X values is the possibility that an open but + idle connection might allow the X value to wrap its sign bit, + resulting in a lockup of the connection. To prevent this, a 24-day + idle timer on each open connection could bypass the X check on the + first segment following the idle period, for example. 
In practice, + many implementations have keep-alive mechanisms that prevent such + long idle periods [RFC-1323]. + + Referring back to Figure 4, our proposed transaction extension + results in a minimum exchange of 3 packets. Segment #3, the final + ACK segment, does not increase transaction latency, but in + combination with the TIME-WAIT delay of K*RTO it ensures that the + server side of the connection will be closed before a new transaction + is issued for this same pair of ports. It also provides an RTT + measurement for the server. + + We may ask whether it would be possible to further reduce the TIME- + + + +Braden [Page 14] + +RFC 1379 Transaction TCP -- Concepts November 1992 + + + WAIT delay. We might set K to zero; alternatively, we might allow + the client TCP to start a new transaction request while the + connection was still in TIME-WAIT state, with the new initial SYN + acting as an implied acknowledgment of the previous FIN. Appendix A + summarizes the issues raised by these alternatives, which we call + "truncating" TIME-WAIT state, and suggests some possible solutions. + Further study would be required, but these solutions appear to bend + the theory and/or implementations of the TCP protocol farther than we + wish to bend them. + + We therefore propose using formula [2] with K=8 and retaining the + final ACK(FIN) transmission. To raise the transaction rate, + therefore, we require small values of RTO and U. + +5. CHOOSING A MONOTONIC SEQUENCE + + For simplicity, we want the monotonic sequence X used for shortening + TIME-WAIT state to be identical to the monotonic sequence M for + bypassing the 3-way handshake. Calling the common space M, we will + send an M value SEG.M in each TCP segment. Upon receipt of an + initial SYN segment, SEG.M will be compared with a per-host cached + value to authenticate the SYN without a 3-way handshake; this is the + TAO mechanism. 
Upon receipt of a non-SYN segment, SEG.M will be + compared with the current value in the connection control block and + used to discard old duplicates. + + Note that the situation with TIME-WAIT state differs from that of + bypassing 3-way handshakes in two ways: (a) TIME-WAIT requires + duplicate detection on every segment vs. only on SYN segments, and + (b) TIME-WAIT applies to a single connection vs. being global across + all connections. This section discusses possible choices for the + common monotonic sequence. + + The SEG.M values must satisfy the following requirements. + + * The values must be monotonic; this requirement is defined more + precisely below. + + * Their granularity must be fine-grained enough to support a high + rate of transaction processing; the M clock must "tick" at least + once between successive transactions. + + * Their range (wrap-around time) must be great enough to allow a + realistic MSL to be enforced by the network. + + The TCP spec calls for an MSL of 120 secs. Since much of the + Internet does not carefully enforce this limit, it would be safer to + have an MSL at least an order of magnitude larger. We set as an + + + +Braden [Page 15] + +RFC 1379 Transaction TCP -- Concepts November 1992 + + + objective an MSL of at least 2000 seconds. If there were no TIME- + WAIT delay, the ultimate limit on transaction rate would be set by + speed-of-light delays in the network and by the latency of host + operating systems. As the bottleneck problems with interfacing CPUs + to gigabit LANs are solved, we can imagine transaction durations as + short as 1 microsecond. Therefore, we set an ultimate performance + goal of TRmax at least 10**6 Tps. + + A particular connection between hosts A and B is identified by the + local and remote TCP "sockets", i.e., by the quadruplet: {A, B, + Port.A, Port.B}. Imagine that each host keeps a count CC of the + number of TCP connections it has initiated. 
We can use this CC + number to distinguish different incarnations of the same connection. + Then a particular SEG.M value may be labeled implicitly by 6 + quantities: {A, B, Port.A, Port.B, CC, n}, where n is the byte offset + of that segment within the connection incarnation. + + To bypass the 3-way handshake, we require that SEG.M values on + successive SYN segments from a host A to a host B be monotone + increasing. If CC' > CC, then we require that: + + SEG.M(A,B,Port.A,Port.B,CC',0) > SEG.M(A,B,Port.A,Port.B,CC,0) + + for any legal values of Port.A and Port.B. + + To delete old duplicates (allowing TIME-WAIT state to be shortened), + we require that SEG.M values be disjoint across different + incarnations of the same connection. If CC' > CC then + + SEG.M(A,B,Port.A,Port.B,CC',n') > SEG.M(A,B,Port.A,Port.B,CC,n), + + for any non-negative integers n and n'. + + We now consider four different choices for the common monotonic + space: RFC-1323 timestamps, TCP sequence numbers, the connection + count, and 64-bit TCP sequence numbers. The results are summarized + in Table I. + + 5.1 Cached Timestamps + + The PAWS mechanism [RFC-1323] uses TCP "timestamps" as + monotonically increasing integers in order to throw out old + duplicate segments within the same incarnation. Jacobson + suggested the caching of these timestamps for bypassing 3-way + handshakes [Jacobson90], i.e., that TCP timestamps be used for our + common monotonic space M. This idea is attractive since it would + allow the same timestamp options to be used for RTTM, PAWS, and + transactions. + + + +Braden [Page 16] + +RFC 1379 Transaction TCP -- Concepts November 1992 + + + To obtain at-most-once service, the criterion for immediate + acceptance of a SYN must be that SEG.M is strictly greater than + the cached M value. 
That is, to be useful for bypassing 3-way + handshakes, the timestamp clock must tick at least once between + any two successive transactions between the same pair of hosts + (even if different ports are used). Hence, the timestamp clock + rate would determine TRmax, the maximum possible transaction rate. + + Unfortunately, the timestamp clock frequency called for by RFC- + 1323, in the range 1 sec to 1 ms, is much too slow for + transactions. The TCP timestamp period was chosen to be + comparable to the fundamental interval for computing and + scheduling retransmission timeouts; this is generally in the range + of 1 sec. to 1 ms., and in many operating systems, much closer to + 1 second. Although it would be possible to increase the timestamp + clock frequency by several orders of magnitude, to do so would + make implementation more difficult, and on some systems + excessively expensive. + + The wraparound time for TCP timestamps, at least 24 days, causes + no problem for transactions. + + The PAWS mechanism uses TCP timestamps to protect against old + duplicate non-SYN segments from the same incarnation [RFC-1323]. + It can also be used to protect against old duplicate data segments + from earlier incarnations (and therefore allow shortening of + TIME-WAIT state) if we can ensure that the timestamp clock ticks + at least once between the end of one incarnation and the beginning + of the next. This can be achieved by setting U = 2 seconds, i.e., + to twice the maximum timestamp clock period. This value in + formula [2] leads to an upper bound TRmax = 32K Tps between a host + pair. However, as pointed out above, old duplicate SYN detection + using timestamps leads to a smaller transaction rate bound, 1 Tps, + which is unacceptable. In addition, the timestamp approach is + imperfect; it allows old ACK segments to enter the new connection + where they can cause a disconnect. 
This happens because old + duplicate ACKs that arrive during TIME-WAIT state generate new + ACKs with the current timestamp [RFC-1337]. + + We therefore conclude that timestamps are not adequate as the + monotonic space M; see Table I. However, they may still be useful + to effectively extend some other monotonic number space, just as + they are used in PAWS to extend the TCP sequence number space. + This is discussed below. + + + + + + + +Braden [Page 17] + +RFC 1379 Transaction TCP -- Concepts November 1992 + + + 5.2 Current TCP Sequence Numbers + + It is useful to understand why the existing 32-bit TCP sequence + numbers do not form an appropriate monotonic space for + transactions. + + The sequence number sent in an initial SYN is called the Initial + Sequence Number or ISN. According to the TCP specification, an + ISN is to be selected using: + + [3] ISN = (R*T) mod 2**32 + + where T is the real time in seconds (from an arbitrary origin, + fixed when the system is started) and R is a constant, currently + 250 KBps. These ISN values form a monotonic time sequence that + wraps in 4.55 hours = 16380 seconds and has a granularity of 4 + usecs. For transaction rates up to roughly 250K Tps, the ISN + value calculated by formula [3] will be monotonic and could be + used for bypassing the 3-way handshake. + + However, TCP sequence numbers (alone) could not be used to shorten + TIME-WAIT state, because there are several ways that overlap of + the sequence space of successive incarnations can occur (as + described in the Appendix to [RFC-1185]). One way is a "fast + connection", with a transfer rate greater than R; another is a + "long" connection, with a duration of approximately 4.55 hours. + TIME-WAIT delay is necessary to protect against these cases. With + the official delay of 240 seconds, formula [1] implies an upper + bound (as RTT -> 0) of TRmax = 268 Tps; with our target MSL of + 2000 sec, TRmax = 32 Tps. These values are unacceptably low. 
+ + To improve this transaction rate, we could use TCP timestamps to + effectively extend the range of the TCP sequence numbers. + Timestamps would guard against sequence number wrap-around and + thereby allow us to increase R in [3] to exceed the maximum + possible transfer rate. Then sequence numbers for successive + incarnations could not overlap. Timestamps would also provide + safety with an MSL as large as 24 days. We could then set U = 0 + in the TIME-WAIT delay calculation [2]. For example, R = 10**9 + Bps leads to TRmax <= 10**9 Tps. See 2(b) in Table I. These + values would more than satisfy our objectives. + + We should make clear how this proposal, sequence numbers plus + timestamps, differs from the timestamps alone discussed (and + rejected) in the previous section. The difference lies in what is + cached and tested for TAO; the proposal here is to cache and test + BOTH the latest TCP sequence number and the latest TCP timestamp. + In effect, we are proposing to use timestamps to logically extend + + + +Braden [Page 18] + +RFC 1379 Transaction TCP -- Concepts November 1992 + + + the sequence space to 64 bits. Another alternative, presented in + the next section, is to directly expand the TCP sequence space to + 64 bits. + + Unfortunately, the proposed solution (TCP sequence numbers plus + timestamps) based on equation [3] would be difficult or impossible + to implement on many systems, which base their TCP implementation + upon a very low granularity software clock, typically O(1 sec). + To adapt the procedure to a system with a low granularity software + clock, suppose that we calculate the ISN as: + + [4] ISN = ( R*Ts*floor(T/Ts) + q*CC) mod 2**32 + + where Ts is the time per tick of the software clock, CC is the + connection count, and q is a constant. That is, the ISN is + incremented by the constant R*Ts once every clock tick and by the + constant q for every new connection. We need to choose q to + obtain the required monotonicity. 
+ + For monotonicity of the ISN's themselves, q=1 suffices. However, + monotonicity during the entire connection requires q = R*Ts. This + value of q can be deduced as follows. Let S(T, CC, n) be the + sequence number for byte offset n in a connection with number CC + at time T: + + S(T, CC, n) = (R*Ts*floor(T/Ts) + q*CC + n) mod 2**32. + + For any T1 > T2, we require that: S(T2, CC+1, 0) - S(T1, CC, n) > + 0 for all n. Since R is assumed to be an upper bound on the + transfer rate, we can write down: + + R > n/(T2 - T1), or T2/Ts - T1/Ts > n/(R*Ts) + + Using the relationship: floor(x)-floor(y) > x-y-1 and a little + algebra leads to the conclusion that using q = R*Ts creates the + required monotonic number sequence. Therefore, we consider: + + [5] ISN = R*Ts*(floor(T/Ts) + CC) mod 2**32 + + (which is the algorithm used for ISN selection by BSD TCP). + + For error-free operation, the sequence numbers generated by [5] + must not wrap the sign bit in less than MSL seconds. Since CC + cannot increase faster than TRmax, the safe condition is: + + R* (1 + Ts*TRmax) * MSL < 2**31. + + We are interested in the case: Ts*TRmax >> 1, so this relationship + + + +Braden [Page 19] + +RFC 1379 Transaction TCP -- Concepts November 1992 + + + reduces to: + + [6] R * Ts * TRmax * MSL < 2**31. + + This shows a direct trade-off among the maximum effective + bandwidth R, the maximum transaction rate TRmax, and the maximum + segment lifetime MSL. For reasonable limiting values of R, Ts, + and MSL, formula [6] leads to a very low value of TRmax. For + example, with MSL= 2000 secs, R=10**9 Bps, and Ts = 0.5 sec, TRmax + < 2*10**-3 Tps. + + To ease the situation, we could supplement sequence numbers with + timestamps. This would allow an effective MSL of 2 seconds in + [6], since longer times would be protected by differing + timestamps. Then TRmax < 2**30/(R*Ts). The actual enforced MSL + would be increased to 24 days. 
Unfortunately, TRmax would still + be too small, since we want to support transfer rates up to R ~ + 10**9 Bps. Ts = 0.5 sec would imply TRmax ~ 2 Tps. On many + systems, it appears infeasible to decrease Ts enough to obtain an + acceptable TRmax using this approach. + + 5.3 64-bit TCP Sequence Numbers + + Another possibility would be to simply increase the TCP sequence + space to 64 bits as suggested in [RFC-1263]. We would also + increase the R value for clock-driven ISN selection, beyond the + fastest transfer rate of which the host is capable. A reasonable + upper limit might be R = 10**9 Bps. As noted above, in a + practical implementation we would use: + + ISN = R*Ts*( floor(T/Ts) + CC) mod 2**64 + + leading to: + + R*(1 + Ts * TRmax) * MSL < 2**63 + + For example, suppose that R = 10**9 Bps, Ts = 0.5 sec, and MSL = 16K + secs (4.4 hrs); then this result implies that TRmax < 10**6 Tps. + We see that adding 32 bits to the sequence space has provided + feasible values for transaction processing. + + 5.4 Connection Counts + + The Connection Count CC is well suited to be the monotonic + sequence M, since it "ticks" exactly once for each new connection + incarnation and is constant within a single incarnation. Thus, it + perfectly separates segments from different incarnations of the + same connection and would allow U = 0 in the TIME-WAIT state delay + + + +Braden [Page 20] + +RFC 1379 Transaction TCP -- Concepts November 1992 + + + formula [2]. (Strictly, U cannot be reduced below 1/R = 4 usec, + as noted in Section 4. However, this is of little practical + consequence until the ultimate limits on TRmax are approached). + + Assume that CC is a 32-bit number. To prevent wrap-around in the + sign bit of CC in less than MSL seconds requires that: + + TRmax * MSL < 2**31 + + For example, if MSL = 2000 seconds then TRmax < 10**6 Tps. These + are acceptable limits for transaction processing. 
However, if + they are not, we could augment CC with TCP timestamps to obtain + very far-out limits, as discussed below. + + It would be an implementation choice at the client whether CC is + global for all destinations or private to each destination host + (and maintained in the per-host cache). In the latter case, the + last CC value assigned for each remote host could also be + maintained in the per-host cache. Since there is not typically a + large amount of parallelism in the network connection of a host, + there should be little difference in the performance of these two + different approaches, and the single global CC value is certainly + simpler. + + To augment CC with TCP timestamps, we would bypass a 3-way + handshake if both SEG.CC > cache.CC[A] and SEG.TSval >= + cache.TS[A]. The timestamp check would detect a SYN older than 2 + seconds, so that the effective wrap-around requirement would be: + + TRmax * 2 < 2**31 + + i.e., TRmax < 10**9 Tps. The required MSL would be raised to 24 + days. Using timestamps in this way, we could reduce the size of + CC. For example, suppose CC were 16 bits. Then the wrap-around + condition TRmax * 2 < 2**15 implies that TRmax < 16K Tps. + + Finally, note that using CC to delete old duplicates from earlier + incarnations would not obviate the need for the time-stamp-based + PAWS mechanism to prevent errors within a single incarnation due + to wrapping the 32-bit TCP sequence space at very high transfer + rates. + + 5.5 Conclusions + + The alternatives for the monotonic sequence are summarized in Table I. + We see that there are two feasible choices for the monotonic + space: the connection count and 64-bit sequence numbers. Of these + two, we believe that the simpler is the connection count. 
+ + + +Braden [Page 21] + +RFC 1379 Transaction TCP -- Concepts November 1992 + + + Implementation of 64-bit sequence numbers would require + negotiation of a new header format and expansion of all variables + and calculations on the sequence space. CC can be carried in an + option and need be examined only once per packet. + + We propose to use a simple 32-bit connection count CC, without + augmentation with timestamps, for the transaction extension. This + choice has the advantages of simplicity and directness. Its + drawback is that it adds a third sequence-like space (in addition + to the TCP sequence number and the TCP timestamp) to each TCP + header and to the main line of packet processing. However, the + additional code is in fact very modest. + + We now have a general outline of the proposed TCP extensions for + transactions. + + o A host maintains a 32-bit global connection counter variable CC. + + o The sender's current CC value is carried in an option in every + TCP segment. + + o CC values are cached per host, and the TAO mechanism is used to + bypass the 3-way handshake when possible. + + o In non-SYN segments, the CC value is used to reject duplicates + from earlier incarnations. This allows TIME-WAIT state delay to + be reduced to K*RTO (i.e., U=0 in Eq. [2]). + + + + + + + + + + + + + + + + + + + + + + + + +Braden [Page 22] + +RFC 1379 Transaction TCP -- Concepts November 1992 + + + TABLE I: Summary of Monotonic Sequences + + APPROACH TRmax (Tps) Required MSL COMMENTS + __________________________________________________________________ + + 1. Timestamp & PAWS 1 24 days TRmax is + too small + __________________________________________________________________ + + 2. Current TCP Sequence Numbers + + (a) clock-driven + ISN: eq. [3] 268 240 secs TRmax & MSL + too small + + (b) Timestamps& clock- + driven ISN [3] & 10**9 24 days Hard to + R=10**9 implement + + (c) Timestamps & c-dr + ISN: eq. [4] 2**30/(R*Ts) 24 days TRmax too + small. 
+ __________________________________________________________________ + + 3. 64-bit TCP Sequence Numbers + + 2**63/(MSL*R*Ts) MSL Significant + TCP change + e.g., R=10**9 Bps, + MSL = 4.4 hrs, + Ts = 0.5 sec=> + TRmax = 10**6 + __________________________________________________________________ + + 4. Connection Counts + + (a) no timestamps 2**31/MSL MSL 3rd sequence + e.g., MSL=2000 sec space + TRmax = 10**6 + + (b) with timestamps 2**30 24 days (ditto) + and PAWS + __________________________________________________________________ + + + + + + + + +Braden [Page 23] + +RFC 1379 Transaction TCP -- Concepts November 1992 + + +6. CONNECTION STATES + + TCP has always allowed a connection to be half-closed. TAO makes a + significant addition to TCP semantics by allowing a connection to be + half-synchronized, i.e., to be open for data transfer in one + direction before the other direction has been opened. Thus, the + passive end of a connection (which receives an initial SYN) can + accept data and even a FIN bit before its own SYN has been + acknowledged. This SYN, data, and FIN may arrive on a single segment + (as in Figure 4), or on multiple segments; packetization makes no + difference to the logic of the finite-state machine (FSM) defining + transitions among connection states. + + Half-synchronized connections have several consequences. + + (a) The passive end must provide an implied initial data window in + order to accept data. The minimum size of this implied window + is a parameter in the specification; we suggest 4K bytes. + + (b) New connection states and transitions are introduced into the + TCP FSM at both ends of the connection. At the active end, new + states are required to piggy-back the FIN on the initial SYN + segment. At the passive end, new states are required for a + half-synchronized connection. + + This section develops the resulting FSM description of a TCP + connection as a conventional state/transition diagram. 
To develop a + complete FSM, we take a constructive approach, as follows: (1) write + down all possible events; (2) write down the precedence rules that + govern the order in which events may occur; (3) construct the + resulting FSM; and (4) augment it to support TAO. In principle, we + do this separately for the active and passive ends; however, the + symmetry of TCP results in the two FSMs being almost entirely + coincident. + + Figure 8 lists all possible state transitions for a TCP connection in + the absence of TAO, as elementary events and corresponding actions. + Each transition is labeled with a letter. Transitions a-g are used + by the active side, and c-i are used by the passive side. Without + TAO, transition "c" (event "rcv ACK(SYN)") synchronizes the + connection, allowing data to be accepted for the user. + + By definition, the first transition for an active (or passive) side + must be "a" (or "i", respectively). During a single instance of a + connection, the active side will progress through some permutation of + the complete sequence of transitions {a b c d e f } or the sequence + {a b c d e f g}. The set of possible permutations is determined by + precedence rules governing the order in which transitions can occur. + + + +Braden [Page 24] + +RFC 1379 Transaction TCP -- Concepts November 1992 + + + Label Event / Action + _____ ________________________ + a OPEN / snd SYN + + b rcv SYN [No TAO]/ snd ACK(SYN) + + c rcv ACK(SYN) / + + d CLOSE / snd FIN + + e rcv FIN / snd ACK(FIN) + + f rcv ACK(FIN) / + + g timeout=2MSL / delete TCB + ___________________________________________________ + h passive OPEN / create TCB + + i rcv SYN [No TAO]/ snd SYN, ACK(SYN) + ___________________________________________________ + + Figure 8. Basic TCP Connection Transitions + + + Using the notation "<." to mean "must precede", the precedence rules + are: + + (1) Logical ordering: must open connection before closing it: + + b <. 
e + + (2) Causality -- cannot receive ACK(x) before x has been sent: + + a <. c and i <. c and d <. f + + (3) Acknowledgments are cumulative + + c <. f + + (4) First packet in each direction must contain a SYN. + + b <. c and b <. f + + (5) TIME-WAIT state + + Whenever d precedes e in the sequence, g must be the last + transition. + + + + +Braden [Page 25] + +RFC 1379 Transaction TCP -- Concepts November 1992 + + + Applying these rules, we can enumerate all possible permutations of + the events and summarize them in a state transition diagram. Figure + 9 shows the result, with boxes representing the states and directed + arcs representing the transitions. + + ________ ________ + | | h | | + | CLOSED |--------->| LISTEN | + |________| |________| + | | + | a | i + ____V____ ____V___ ________ + | | b | | e | | + | |--------->| |-------------->| | + |________| |________| |________| + / / | / | + / / | c d / | c + / / __V_____ | ____V___ + / / | | e | | | + d | d / | |------------>| | + | | |________| | |________| + | | | | | + | | | ___V____ | + | | | | | | + | | | | | | + | | | |________| | + | | | | | + ____V___ ______V_ | ________ | | + | | b | | e | | | | | + | |------->| |--------->| | | | + |________| |________| | |________| | | + | / | | | + c | / d c | c | d | + | / | | | + _V___V__ ____V___ V_____V_ + | | e | | | | + | |---->| | | | + |________| |________| |________| + | | | + | f | f | f + ____V___ ____V___ ___V____ + | | e | TIME- | g | | + | |---->| WAIT |-->| CLOSED | + |________| |________| |________| + + + Figure 9: Basic State Diagram + + + + +Braden [Page 26] + +RFC 1379 Transaction TCP -- Concepts November 1992 + + + Although Figure 9 gives a correct representation of the possible + event sequences, it is not quite correct for the actions, which do + not compose as shown. In particular, once a control bit X has been + sent, it must continue to be sent until ACK(X) is received. 
This + requires new transitions with modified actions, shown in the + following list. We use the labeling convention that transitions with + the same event part all have the same letter, with different numbers + of primes to indicate different actions. + + Label Event / Action + _____ _______________________________________ + b' (=i) rcv SYN [No TAO] / snd SYN,ACK(SYN) + b'' rcv SYN [No TAO] / snd SYN,FIN,ACK(SYN) + d' CLOSE / snd SYN,FIN + e' rcv FIN / snd FIN,ACK(FIN) + e'' rcv FIN / snd SYN,FIN,ACK(FIN) + + + Figure 10 shows the state diagram of Figure 9, with the modified + transitions and with the states used by standard TCP [STD-007] + identified. Those states that do not occur in standard TCP are + numbered 1-5. + + Standard TCP has another implied restriction: a FIN bit cannot be + recognized before the connection has been synchronized, i.e., c <. e. + This eliminates from standard TCP the states 1, 2, and 5 shown in + Figure 10. States 3 and 4 are needed if a FIN is to be piggy-backed + on a SYN segment (note that the states shown in Figure 1 are actually + wrong; the states shown as SYN-SENT and ESTABLISHED are really states + 3 and 4). In the absence of piggybacking the FIN bit, Figure 10 + reduces to the standard TCP state diagram [STD-007]. + + The FSM described in Figure 10 is intended to be applied + cumulatively; that is, parsing a single packet header may lead to + more than one transition. For example, the standard TCP state + diagram includes a direct transition from SYN-SENT to ESTABLISHED: + + rcv SYN,ACK(SYN) / snd ACK(SYN). + + This is transition b followed immediately by c. 
+ + + + + + + + + + + +Braden [Page 27] + +RFC 1379 Transaction TCP -- Concepts November 1992 + + + ________ ________ + | | h | | + | CLOSED |--------->| LISTEN | + |________| |________| + | | + | a | i + ____V____ ____V___ ________ + | SYN- | b' | SYN- | e' | | + | SENT |--------->|RECEIVED|-------------->| 1 | + |________| |________| |________| + / / | | | + d'/ d'/ | c d' | c | + / / __V_____ | _V______ + / / |ESTAB- | e | | CLOSE- | + | / | LISHED|------------|-->| WAIT | + | | |________| | |________| + | | | | | + | | | _____V__ | + | | | | | | + | | | | 2 | | + | | | |________| | + | | | | | + ____V___ ______V_ | ________ | | + | | b'' | |e''' | | | | | + | 3 |------->| 4 |--------->| 5 | | | + |________| |________| | |________| | | + | / | | | + c | / d c | c | d | + | / | | | + _V___V__ ____V___ V_____V_ + | FIN- | e'' | | | LAST- | + | WAIT-1|---->|CLOSING | | ACK | + |________| |________| |________| + | | | + | f | f | f + ____V___ ____V___ ___V____ + | FIN- | e | TIME- | g | | + | WAIT-2|---->| WAIT |-->| CLOSED | + |________| |________| |________| + + + Figure 10: Basic State Diagram -- Correct Actions + + + Next we introduce TAO. If the TAO test succeeds, the connection + becomes half-synchronized. This requires a new set of states, + mirroring the states of Figure 10, beginning with acceptance of a SYN + (transition "b" or "i"), and ending when ACK(SYN) arrives (transition + + + +Braden [Page 28] + +RFC 1379 Transaction TCP -- Concepts November 1992 + + + "c"). Figure 11 shows the result of augmenting Figure 10 with the + additional states for TAO. 
The transitions are defined in the + following table: + + Key for Figure 11: Complete State Diagram with TAO + + + Label Event / Action + _____ ________________________ + + a OPEN / create TCB, snd SYN + b' rcv SYN [no TAO]/ snd SYN,ACK(SYN) + b'' rcv SYN [no TAO]/ snd SYN,FIN,ACK(SYN) + c rcv ACK(SYN) / + d CLOSE / snd FIN + d' CLOSE / snd SYN,FIN + e rcv FIN / snd ACK(FIN) + e' rcv FIN / snd SYN,ACK(FIN) + e'' rcv FIN / snd FIN,ACK(FIN) + e''' rcv FIN / snd SYN,FIN,ACK(FIN) + f rcv ACK(FIN) / + g timeout=2MSL / delete TCB + h passive OPEN / create TCB + i (= b') rcv SYN [no TAO]/ snd SYN,ACK(SYN) + j rcv SYN [TAO OK] / snd SYN,ACK(SYN) + k rcv SYN [TAO OK] / snd SYN,FIN,ACK(SYN) + + + + Each new state in Figure 11 bears a very simple relationship to a + standard TCP state. We indicate this by naming the new state with + the standard state name followed by a star. States SYN-SENT* and + SYN-RECEIVED* differ from the corresponding unstarred states in + recording the fact that a FIN has been sent. The other new states + with starred names differ from the corresponding unstarred states in + being half-synchronized (hence, a SYN bit needs to be transmitted). + + The state diagram of Figure 11 is more general than required for + transaction processing. In particular, it handles simultaneous + connection synchronization from both sides, allowing one or both + sides to bypass the 3-way handshake. It includes other transitions + that are unlikely in normal transaction processing, for example, the + server sending a FIN before it receives a FIN from the client + (ESTABLISHED* -> FIN-WAIT-1* in Figure 11). 
+ + + + + + + +Braden [Page 29] + +RFC 1379 Transaction TCP -- Concepts November 1992 + + + ________ ________ + | | h | | + | CLOSED |--------------->| LISTEN | + |________| |________| + | / | + a| / i | j + | / | + | / _V______ ________ + | j | |ESTAB- | e' | CLOSE- | + | /---------|----->| LISHED*|------------>| WAIT*| + | / | |________| |________| + | / | | | | | + | / | |d' | c d' | | c + ____V___ / ______V_ | _V______ | _V______ + | SYN- | b' | SYN- | c | |ESTAB- | e | | CLOSE- | + | SENT |------>|RECEIVED|-----|-->| LISHED|----------|->| WAIT | + |________| |________| | |________| | |________| + | | | | | | + | | | | ___V____ | + | | | | | LAST- | | + | d' | d' | d' | d | ACK* | | + | | | | |________| | + | | | | | | + | | ______V_ | ________ |c |d + | k | | FIN- | | e''' | | | | + | /------|-->| WAIT-1*|---|------>|CLOSING*| | | + | / | |________| | |________| | | + | / | | | | | | + | / | | c | | c | | + ____V___ / ____V___ V_____V_ ____V___ V____V__ + | SYN- | b'' | SYN- | c | FIN- | e'' | | | LAST- | + | SENT* |----->|RECEIVD*|---->| WAIT-1 |---->|CLOSING | | ACK | + |________| |________| |________| |________| |________| + | | | + | f | f | f + ___V____ ____V___ ___V____ + | FIN- | e |TIME- | g | | + | WAIT-2 |---->| WAIT |-->| CLOSED | + |________| |________| |________| + + Figure 11: Complete State Diagram with TAO + + + + The relationship between starred and unstarred states is very + regular. As a result, the state extensions can be implemented very + simply using the standard TCP FSM with the addition of two "hidden" + boolean flags, as described in the functional specification memo + + + +Braden [Page 30] + +RFC 1379 Transaction TCP -- Concepts November 1992 + + + [TTCP-FS]. + + As an example of the application of Figure 11, consider the minimal + transaction shown in Figure 12. + + + TCP A (Client) TCP B (Server) + _______________ ______________ + + CLOSED LISTEN + + 1. 
SYN-SENT* --> --> CLOSE-WAIT* + (TAO test OK=> + data1->user_B) + + LAST-ACK* + <-- <-- + 2. TIME-WAIT + (TAO test OK, + data2->user_A) + + + 3. TIME-WAIT --> --> CLOSED + + (timeout) + CLOSED + + + Figure 12: Minimal Transaction Sequence + + Sending segment #1 leaves the client end in SYN-SENT* state, which + differs from SYN-SENT state in recording the fact that a FIN has been + sent. At the server end, passing the TAO test enters ESTABLISHED* + state, which passes the data to the user as in ESTABLISHED state and + also records the fact that the connection is half synchronized. Then + the server processes the FIN bit of segment #1, moving to CLOSE-WAIT* + state. + + Moving to CLOSE-WAIT* state should cause the server to send a segment + containing SYN and ACK(FIN). However, transmission of this segment + is deferred so the server can piggyback the response data and FIN on + the same segment, unless a timeout occurs first. When the server + does send segment #2 containing the response data2 and a FIN, the + connection advances from CLOSE-WAIT* to LAST-ACK* state; the + connection is still half-synchronized from B's viewpoint. + + Processing segment #2 at the client again results in multiple + transitions: + + + +Braden [Page 31] + +RFC 1379 Transaction TCP -- Concepts November 1992 + + + SYN-SENT* -> FIN-WAIT-1* -> CLOSING* -> CLOSING -> TIME-WAIT + + These correspond respectively to receiving a SYN, a FIN, an ACK for + A's SYN, and an ACK for A's FIN. + + Figure 13 shows a slightly more complex example, a transaction + sequence in which request and response data each require two + segments. This figure assumes that both client and server TCP are + well-behaved, so that e.g., the client sends the single segment #5 to + acknowledge both data segments #3 and #4. SEG.CC values are omitted + for clarity. + + + _T_C_P__A _T_C_P__B + + + 1. SYN-SENT* --> --> ESTABLISHED* + (TAO OK, + data1-> user) + + 2. SYN-SENT* --> --> CLOSE-WAIT* + (data2-> user) + + 3. 
FIN-WAIT-2 <-- <-- CLOSE-WAIT* + (data3->user) + + 4. TIME-WAIT <-- <-- LAST-ACK* + (data4->user) + + 5. TIME-WAIT --> --> CLOSED + + + Figure 13. Multi-Packet Request/Response Transaction + + +7. CONCLUSIONS AND ACKNOWLEDGMENTS + + TCP was designed to be a highly symmetric protocol. This symmetry is + evident in the piggy-backing of acknowledgments on data and in the + common header format for data segments and acknowledgments. On the + other hand, the examples and discussion in this memo are in general + highly unsymmetrical; the actions of a "client" are clearly + distinguished from those of a "server". To explain this apparent + discrepancy, we note the following. Even when TCP is used for + virtual circuit service, the data transfer phase is symmetrical but + the open and close phases are not. A minimal transaction, consisting + of one segment in each direction, compresses the open, data transfer, + and close phases together, making the asymmetry of the open and + + + +Braden [Page 32] + +RFC 1379 Transaction TCP -- Concepts November 1992 + + + close phases dominant. As request and response messages increase in + size, the virtual circuit model becomes increasingly relevant, and + symmetry again dominates. + + TCP's 3-way handshake precludes any performance gain from including + data on a SYN segment, while TCP's full-duplex data-conserving close + sequence ties up communication resources to the detriment of high- + speed transactions. Merely loading more control bits onto TCP data + segments does not provide efficient transaction service. To use TCP + as an effective transaction transport protocol requires bypassing the + 3-way handshake and shortening the TIME-WAIT delay. This memo has + proposed a backwards-compatible TCP extension to accomplish both + goals. It is our hope that by building upon the current version of + TCP, we can give a boost to community acceptance of the new + facilities. 
Furthermore, the resulting protocol implementations will + retain the algorithms that have been developed for flow and + congestion control in TCP [Jacobson88]. + + O'Malley and Peterson have recently recommended against backwards- + compatible extensions to TCP, and suggested instead a mechanism to + allow easy installation of alternative versions of a protocol [RFC- + 1263]. While this is an interesting long-term approach, in the + shorter term we suggest that incremental extension of the current TCP + may be a more effective route. + + Besides the backward-compatible extension proposed here, there are + two other possible approaches to making efficient transaction + processing widely available in the Internet: (1) a new version of TCP + or (2) a new protocol specifically adapted to transactions. Since + current TCP "almost" supports transactions, we favor (1) over (2). A + new version of TCP that retained the semantics of STD-007 but used 64 + bit sequence numbers with the procedures and states described in + Sections 3, 4, and 6 of this memo would support transactions as well + as virtual circuits in a clean, coherent manner. + + A potential application of transaction-mode TCP might be SMTP. If + commands and responses are batched, in favorable cases complete SMTP + delivery operations on short messages could be performed with a + single minimal transaction; on the other hand, the body of a message + may be arbitrarily large. Using a TCP extended as in this memo could + significantly reduce the load on large mail hosts. + + This work began as an elaboration of the concept of TAO, due to Dave + Clark. I am grateful to him and to Van Jacobson, John Wroclawski, + Dave Borman, and other members of the End-to-End Research group for + helpful ideas and critiques during the long development of this work. + I also thank Liming Wei, who tested the initial implementation in Sun + OS. 
+ + + +Braden [Page 33] + +RFC 1379 Transaction TCP -- Concepts November 1992 + + +APPENDIX A -- TIME-WAIT STATE AND THE 2-PACKET EXCHANGE + + This appendix considers the implications of reducing TIME-WAIT state + delay below that given in formula [2]. + + An immediate consequence of this would be the requirement for the + server host to accept an initial SYN for a connection in LAST-ACK + state. Without the transaction extensions, the arrival of a new <SYN> + in LAST-ACK state looks to TCP like a half-open connection, and + TCP's rules are designed to restore correspondence by destroying the + state (through sending a RST segment) at one end or the other. We + would need to thwart this action in the case of transactions. + + There are two different possible ways to further reduce TIME-WAIT + delay. + + (1) Explicit Truncation of TIME-WAIT state + + TIME-WAIT state could be explicitly truncated by accepting a new + sendto() request for a connection in TIME-WAIT state. + + This would allow the ACK(FIN) segment to be delayed and sent + only if a timeout occurs before a new request arrives. This + allows an ideal 2-segment exchange for closely-spaced + transactions, which would restore some symmetry to the + transaction exchange. However, explicit truncation would + represent a significant change in many implementations. + + It might be supposed that even greater symmetry would result if + the new request segment were a <SYN,ACK> that explicitly + acknowledges the previous reply, rather than a <SYN> that is + only an implicit acknowledgment. However, the new request + segment might arrive at B to find the server side in either + LAST-ACK or CLOSED state, depending upon whether the ACK(FIN) + had arrived. In CLOSED state, a <SYN,ACK> would not be + acceptable. Hence, if the client sent an initial <SYN,ACK> + instead of a <SYN> segment, there would be a race condition at + the server. + + (2) No TIME-WAIT delay + + TIME-WAIT delay could be removed entirely. 
This would imply + that the ACK(FIN) would always be sent (which does not of course + guarantee that it will be received). As a result, the arrival + of a new SYN in LAST-ACK state would be rare. + + This choice is much simpler to implement. Its drawback is that + the server will get a false failure report if the ACK(FIN) is + + + +Braden [Page 34] + +RFC 1379 Transaction TCP -- Concepts November 1992 + + + lost. This may not matter in practice, but it does represent a + significant change of TCP semantics. It should be noted that + reliable delivery of the reply is not an issue. The client + enters TIME-WAIT state only after the entire reply, including the + FIN bit, has been received successfully. + + The server host B must be certain that a new request received in + LAST-ACK state is indeed a new SYN and not an old duplicate; + otherwise, B could falsely acknowledge a previous response that has + not in fact been delivered to A. If the TAO comparison succeeds, the + SYN must be new; however, the server has a dilemma if the TAO test + fails. + + In Figure A.1, for example, the reply segment from the first + transaction has been lost; since it has not been acknowledged, it is + still in B's retransmission queue. An old duplicate request, segment + #3, arrives at B and its TAO test fails. B is in the position of + having old state it cannot discard (the retransmission queue) and + needing to build new state to pursue a 3-way handshake to validate + the new SYN. If the 3-way handshake failed, it would need to restore + the earlier LAST-ACK* state. (Compare with Figure 15 "Old Duplicate + SYN Initiates a Reset on Two Passive Sockets" in STD-007). This + would be complex and difficult to accomplish in many implementations. + + + TCP A (Client) TCP B (Server) + _______________ ______________ + + CLOSED LISTEN + + + 1. SYN-SENT* --> --> CLOSE-WAIT* + (TAO test OK; + data1->server) + + 2. (lost) X<-- <-- LAST-ACK* + + (old duplicate) + 3. ... 
--> LAST-ACK* + (TAO test fail; + 3-way handshake?) + + Figure A.1: The Server's Dilemma + + + The only practical action A can take when the TAO test fails on a + new SYN received in LAST-ACK state is to ignore the SYN, assuming it + is really an old duplicate. We must pursue the possible consequences + + + +Braden [Page 35] + +RFC 1379 Transaction TCP -- Concepts November 1992 + + + of this action. + + Section 3.1 listed four possible reasons for failure of the TAO test + on a legitimate SYN segment: (1) no cached state, (2) out-of-order + delivery of SYNs, (3) wraparound of CCgen relative to the cached + value, or (4) the M values advance too slowly. We are assuming that + there is a cached CC value at B (otherwise, the SYN cannot be + acceptable in LAST-ACK state). Wrapping the CC space is very + unlikely and probably impossible; it is difficult to imagine + circumstances which would allow the new SYN to be delivered but not + the ACK(FIN), especially given the long wraparound time of CCgen. + + This leaves the problem of out-of-order delivery of two nearly- + concurrent SYNs for different ports. The second to be delivered may + have a lower CC option and thus be locked out. This can be solved by + using a new CCgen value for every retransmission of an initial SYN. + + Truncation of TIME-WAIT state and acceptance of a SYN in LAST-ACK + state should take place only if there is a cached CC value for the + remote host. Otherwise, a SYN arriving in LAST-ACK state is to be + processed by normal TCP rules, which will result in a RST segment + from either A or B. + + This discussion leads to a paradigm for rejecting old duplicate + segments that is different from TAO. This alternative scheme is + based upon the following: + + (a) Each retransmission of an initial SYN will have a new value of + CC, as described above. + + This provision takes care of reordered SYNs. + + (b) A host maintains a distinct CCgen value for each remote host. 
+ This value could easily be maintained in the same cache used for + the received CC values, e.g., as cache.CCgen[]. + + Once the caches are primed, it should always be true that + cache.CCgen[B] on host A is equal to cache.CC[A] on host B, and + the next transaction from A will carry a CC value exactly 1 + greater. Thus, there is no problem of wraparound of the CC + value. + + (c) A new SYN is acceptable if its SEG.CC > cache.CC[client], + otherwise the SYN is ignored as an old duplicate. + + This alternative paradigm was not adopted because it would be a + somewhat greater perturbation of TCP rules, because it may not have + the robustness of TAO, and because all of its consequences may not be + + + +Braden [Page 36] + +RFC 1379 Transaction TCP -- Concepts November 1992 + + + understood. + + +REFERENCES + + [Birrell84] Birrell, A. and B. Nelson, "Implementing Remote + Procedure Calls", ACM TOCS, Vo. 2, No. 1, February 1984. + + [Clark88] Clark, D., "The Design Philosophy of the Internet + Protocols", ACM SIGCOMM '88, Stanford, CA, August 1988. + + [Clark89] Clark, D., Private communication, 1989. + + [Garlick77] Garlick, L., R. Rom, and J. Postel, "Issues in Reliable + Host-to-Host Protocols", Proc. Second Berkeley Workshop on + Distributed Data Management and Computer Networks, May 1977. + + [HR-COMM] Braden, R., Ed., "Requirements for Internet Hosts -- + Communication Layers", STD-003, RFC-1122, October 1989. + + [Jacobson88] Jacobson, V., "Congestion Avoidance and Control", + SIGCOMM '88, Stanford, CA., August 1988. + + [Jacobson90] Jacobson, V., private communication, 1990. + + [Liskov90] Liskov, B., Shrira, L., and J. Wroclawski, "Efficient + At-Most-Once Messages Based on Synchronized Clocks", ACM SIGCOMM + '90, Philadelphia, PA, September 1990. + + [RFC-955] Braden, R., "Towards a Transport Service Transaction + Protocol", RFC-955, September 1985. 
+ + [RFC-1185] Jacobson, V., Braden, R., and Zhang, L., "TCP Extension + for High-Speed Paths", RFC-1185, October 1990. + + [RFC-1263] O'Malley, S. and L. Peterson, "TCP Extensions Considered + Harmful", RFC-1263, University of Arizona, October 1991. + + [RFC-1323] Jacobson, V., Braden, R., and Borman, D., "TCP + Extensions for High Performance, RFC-1323, February 1991. + + [RFC-1337] Braden, R., "TIME-WAIT Assassination Hazards in TCP", + RFC-1337, May 1992. + + [STD-007] Postel, J., "Transmission Control Protocol - DARPA + Internet Program Protocol Specification", STD-007, RFC-793, + September 1981. + + + + +Braden [Page 37] + +RFC 1379 Transaction TCP -- Concepts November 1992 + + + [TTCP-FS] Braden, R., "Transaction TCP -- Functional + Specification", Work in Progress, September 1992. + + [Watson81] Watson, R., "Timer-based Mechanisms in Reliable + Transport Protocol Connection Management", Computer Networks, Vol. + 5, 1981. + +Security Considerations + + Security issues are not discussed in this memo. + +Author's Address + + Bob Braden + University of Southern California + Information Sciences Institute + 4676 Admiralty Way + Marina del Rey, CA 90292 + + Phone: (310) 822-1511 + EMail: Braden@ISI.EDU + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Braden [Page 38] + \ No newline at end of file diff --git a/ext/picotcp/RFC/rfc1470.txt b/ext/picotcp/RFC/rfc1470.txt new file mode 100644 index 0000000..5ccb856 --- /dev/null +++ b/ext/picotcp/RFC/rfc1470.txt @@ -0,0 +1,10755 @@ + + + + + + +Network Working Group R. Enger +Request for Comments: 1470 ANS +FYI: 2 J. Reynolds +Obsoletes: 1147 ISI + Editors + June 1993 + + + FYI on a Network Management Tool Catalog: + Tools for Monitoring and Debugging TCP/IP Internets + and Interconnected Devices + +Status of this Memo + + This memo provides information for the Internet community. It does + not specify an Internet standard. Distribution of this memo is + unlimited. 
+ +Abstract + + The goal of this FYI memo is to provide an update to FYI 2, RFC 1147 + [1], which provided practical information to site administrators and + network managers. New and/or updated tools are listed in this RFC. + Additional descriptions are welcome, and should be sent to: noctools- + entries@merit.edu. + +Introduction + + A static document cannot incorporate references to the latest tools + nor recent revisions to the older catalog entries. To provide a more + timely and responsive information source, the NOCtools catalog is + available on-line via the Internet and Usenet. + + news comp.networks.noctools + ftp wuarchive.wustl.edu:/doc/noctools + + Because of publication delays and other factors, some of the entries + in this catalog may be out of date. The reader is urged to consult + the on-line service to obtain the most up-to-date information. + + The index provided in this document reflects the current contents of + the on-line documentation. + + The NOCtools2 Working Group of the Internet Engineering Task Force + (IETF) has compiled this revised catalog. Future revisions will be + incorporated into the on-line NOCtools catalog. The reader is + encouraged to submit new or revised entries for (near-immediate) + electronic publication. + + + +NOCTools2 Working Group [Page 1] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + The tools described in this catalog are in no way endorsed by the + IETF. For the most part, we have neither evaluated the tools in this + catalog, nor validated their descriptions. Most of the descriptions + of commercial tools have been provided by vendors. Caveat Emptor. + +Acknowledgements + + This catalog is the result of work on the part of the NOCTools2 + Working Group of the User Services Area of the IETF. The following + individuals made especially notable contributions: Chris Myers, + Darren Kinley, Gary Malkin, Mohamed Ellozy, and Mike Patton. 
+ +Current Postings + + The current contents of the NOCtools catalog may be retrieved via + anonymous FTP from wuarchive.wustl.edu. The entries are stored as + individual files in the directory /doc/noctools. + +"No-Writeups" Appendix + + This section contains references to tools which are known to exist, + but which have not been fully cataloged. If anyone wishes to author + an entry for one of these tools please contact us at: + + noctools-request@merit.edu + + Keep in mind that if these or other tools are included in the future, + they will be available in the on-line version of the catalog. + + Each mention is separated by a for improved readability. + If you intend to actually print-out this section of the catalog, then + you should probably strip-out the . + +How to Submit/Update an Entry + + 1) review the template included below to determine what + information you will need to collect, + 2) review the keywords to see what your indexing options are, + 3) assemble (update) catalog entry to include results of + 1) and 2). + 4) Submit your entry using either of the following two methods: + + a) Post your submission to: comp.internet.noctools.submissions + b) Email your submission to: noctools-entries@merit.edu + + New entries will be circulated automatically upon reception. As time + permits, the NOCtools editors will review recent submissions and + incorporate them into the master indexes. Enquiries regarding the + + + +NOCTools2 Working Group [Page 2] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + status of a submission should be E-Mailed to: + + noctools-request@merit.edu + + Those submitting an entry to the catalog should insure that any E- + mail addresses provided are correct and functional. Either the + catalog editors or prospective users of your tool may wish to reach + you. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 3] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + +TEMPLATE + + NAME + + + + KEYWORDS + [[,[,...,]]]; + [[,[,...,]]]; + [[,[,...,]]]; + [[,[,...,]]]; + [[,[,...,]]]. + + + ABSTRACT + + + + + + MECHANISM + + + + + + CAVEATS + + + + + + BUGS + + + + + + LIMITATIONS + + + + + + HARDWARE REQUIRED + + + + + + +NOCTools2 Working Group [Page 4] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + SOFTWARE REQUIRED + + + + + + AVAILABILITY AND CONTACT POINT FOR INFORMATION ABOUT THIS TOOL + + + + + CONTACT POINT FOR CHANGES TO THIS CATALOG ENTRY + + + + DATE OF MOST RECENT UPDATE TO THIS CATALOG ENTRY + + +Keywords + + This catalog uses "keywords" for terse characterizations of the + tools. Keywords are abbreviated attributes of a tool or its use. To + allow cross-comparison of tools, uniform keyword definitions have + been developed, and are given below. Following the definitions, + there is an index of catalog entries by keyword. + +Keyword Definitions + + The keywords are always listed in a prefined order, sorted first by + the general category into which they fall, and then alphabetically. + The categories that have been defined for management tool keywords + are: + + o the general management area to which a tool + relates or a tool's functional role; + + o the network resources or components that are + managed; + + o the mechanisms or methods a tool uses to + perform its functions; + + o the operating system and hardware environment + of a tool; and + + o the characteristics of a tool as a hardware + product or software release. + + + + +NOCTools2 Working Group [Page 5] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + The keywords used to describe the general management area or + functional role of a tool are: + + Alarm + a reporting/logging tool that can trigger on specific + events within a network. 
+ + Analyzer + a traffic monitor that reconstructs and interprets pro- + tocol messages that span several packets. + + Benchmark + a tool used to evaluate the performance of network com- + ponents. + + Control + a tool that can change the state or status of a remote + network resource. + + Debugger + a tool that by generating arbitrary packets and moni- + toring traffic, can drive a remote network component to + various states and record its responses. + + Generator + a traffic generation tool. + + Manager + a distributed network management system or system com- + ponent. + + Map + a tool that can discover and report a system's topology + or configuration. + + Reference + a tool for documenting MIB structure or system confi- + guration. + + Routing + a packet route discovery tool. + + Security + a tool for analyzing or reducing threats to security. + + Status + a tool that remotely tracks the status of network com- + ponents. + + + +NOCTools2 Working Group [Page 6] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Traffic + a tool that monitors packet flow. + + The keywords used to identify the network resources or components + that a tool manages are: + + Bridge + a tool for controlling or monitoring LAN bridges. + + CHAOS + a tool for controlling or monitoring implementations of + the CHAOS protocol suite or network components that use + it. + + DECnet + a tool for controlling or monitoring implementations of + the DECnet protocol suite or network components that + use it. + + DNS + a Domain Name System debugging tool. + + Ethernet + a tool for controlling or monitoring network components + on ethernet LANs. + + FDDI + a tool for controlling or monitoring network components + on FDDI LANs or WANs. + + IP + a tool for controlling or monitoring implementations of + the TCP/IP protocol suite or network components that + use it. 
+ + OSI + a tool for controlling or monitoring implementations of + the OSI protocol suite or network components that use + it. + + NFS + a Network File System debugging tool. + + Ring + a tool for controlling or monitoring network components + on Token Ring LANs. + + + + + +NOCTools2 Working Group [Page 7] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + SMTP + an SMTP debugging tool. + + Star + a tool for controlling or monitoring network components + on StarLANs. + + The keywords used to describe a tool's mechanism are: + + CMIS + a network management system or component based on + CMIS/CMIP, the Common Management Information System and + Protocol. + + Curses + a tool that uses the "curses" tty interface package. + + Eavesdrop + a tool that silently monitors communications media + (e.g., by putting an ethernet interface into "promiscu- + ous" mode). + + NMS + the tool is a component of or queries a Network Manage- + ment System. + + Ping + a tool that sends packet probes such as ICMP echo mes- + sages; to help distinguish tools, we do not consider + NMS queries or protocol spoofing (see below) as probes. + + Proprietary + a distributed tool that uses proprietary communications + techniques to link its components. + + RMON + a tool which employs the RMON extensions to SNMP. + + SNMP + a network management system or component based on SNMP, + the Simple Network Management Protocol. + + Spoof + a tool that tests operation of remote protocol modules + by peer-level message exchange. + + X + a tool that uses X-Windows. + + + +NOCTools2 Working Group [Page 8] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + The keywords used to describe a tool's operating environment are: + + DOS + a tool that runs under MS-DOS. + + HP + a tool that runs on Hewlett-Packard systems. + + Macintosh + a tool that runs on Macintosh personal computers. + + OS/2 + a tool that runs under the OS/2 operating system. 
+ + Standalone + an integrated hardware/software tool that requires only + a network interface for operation. + Sun + a tool that runs on Sun Microsystems platforms. + (binary distribution built for use on a Sun.) + + UNIX + a tool that runs under 4.xBSD UNIX or related OS. + + VMS + a tool that runs under DEC's VMS operating system. + + The keywords used to describe a tool's characteristics as a hardware + or software acquisition are: + + Free + a tool is available at no charge, though other restric- + tions may apply (tools that are part of an OS distribu- + tion but not otherwise available are not listed as + "free"). + + Library + a tool packaged with either an Application Programming + Interface (API) or object-level subroutines that may be + loaded with programs. + + Sourcelib + a collection of source code (subroutines) upon which + developers may construct other tools. + + + + + + + +NOCTools2 Working Group [Page 9] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + +Tools Indexed by Keywords + + Following is an index of the most up-to-date catalog entries sorted + by keyword, which is available via: + + news comp.networks.noctools.tools + ftp wuarchive.wustl.edu:/doc/noctool + + This index can be used to locate the tools with a particular + attribute: tools are listed under each keyword that characterizes + them. The keywords and the subordinate lists of tools under them are + in alphabetical order. 
+ + Alarm + ----- + CMIP Library + Dual Manager + Eagle + EMANATE + EtherMeter + LanProbe + LANWatch + MONET + NetMetrix Load Monitor + NetMetrix Protocol Analyzer + NETMON for Windows + NETscout + NOCOL + SNMP Libraries and Utilities from Empire Technologies + SNMP Libraries and Utilities from SNMP Research + snmpd from Empire Technologies + SpiderMonitor + XNETMON from SNMP Research + xnetmon from Wellfleet + + Analyzer + -------- + LANVista + LANWatch + NetMetrix Protocol Analyzer + NETscout + PacketView + Sniffer + SpiderMonitor + + + + + + + +NOCTools2 Working Group [Page 10] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Benchmark + --------- + hammer & anvil + iozone + LADDIS + LANVista + nhfsstone + SPIMS + spray + ttcp + XNETMON from SNMP Research + + CMIS + ---- + CMIP library + Generic Managed System + MIB Browser + + Control + ------- + CMIP Library + Dual Manager + Eagle + MIB Manager from Empire Technologies + MONET + NETMON for Windows + proxyd + SNMP Libraries and Utilities from Empire Technologies + SNMP Libraries and Utilities from SNMP Research + SNMP Packaged Agent System + snmpd from Empire Technologies + TokenVIEW + XNETMON from SNMP Research + + Debugger + -------- + Ethernet Box II + LANVista + NetMetrix Traffic Generator + ping from UCB + SPIMS + XNETMON from SNMP Research + + Generator + --------- + hammer & anvil + LADDIS + LANVista + + + +NOCTools2 Working Group [Page 11] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + NetMetrix Traffic Generator + nhfsstone + ping + ping from UCB + Sniffer + SpiderMonitor + spray + TTCP + + Manager + ------- + Beholder + CMIP Library + CMU SNMP Distribution + decaddrs by Wellfleet + Dual Manager + EMANATE + Ethernet Box II + getone by Wellfleet + Interactive Network Map + LanProbe + LANVista + MIB Manager from Empire Technologies + MONET + NetLabs CMOT Agent + NetLabs SNMP Agent + NETMON for Windows + NETscout + NNStat + NOCOL + OverVIEW + SAS/CPE for Open Systems 
Software + SNMP Development Kit + SNMP Libraries and Utilities from Empire Technologies + SNMP Libraries and Utilities from SNMP Research + SNMP Packaged Agent System + snmpd from Empire Technologies + tokenview + Tricklet + Wollongong-Manager + XNETMON from SNMP Research + XNETMON from Wellfleet + xnetperfmon + + Map + --- + decaddrs by Wellfleet + Dual Manager + + + +NOCTools2 Working Group [Page 12] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + etherhostprobe + EtherMeter + Interactive Network Map + LanProbe + NETMON for Windows + Network Integrator I + NPRV + SNMP Libraries and Utilities from SNMP Research + XNETMON by SNMP Research + XNETMON by Wellfleet + + Reference + --------- + EMANATE + ethernet-codes + HyperMIB + MIB Manager from Empire Technologies + XNETMON + + Routing + ------- + arp + decaddrs by Wellfleet + etherhostprobe + getone by Wellfleet + hopcheck + MONET + net_monitor + NETMON for Windows + netstat + NPRV + ping from UCB + query + traceroute + + Security + -------- + Computer Security Checklist + Dual Manager + Eagle + EMANATE + LAN Patrol + SNMP Libraries and Utilities from SNMP Research + XNETMON by SNMP Research + xnetperfmon + + + + + + +NOCTools2 Working Group [Page 13] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Status + ------ + Beholder + CMIP Library + CMU SNMP + DiG + dnsstats + doc + Dual Manager + EMANATE + fping + getone by Wellfleet + host + Internet Rover + lamers + LanProbe + mconnect + MONET + net_monitor + Netlabs CMOT Agent + Netlabs SNMP Agent + NETscout + NNStat + NOCOL + NPRV + OverVIEW + ping + ping from UCB + proxyd from SNMP Research + SAS/CPE + SNMP Development Kit + SNMP Libraries and Utilities from Empire Technologies + SNMP Libraries and Utilities from SNMP Research + SNMP Packaged Agent System + PSI SNMP + snmpd from Empire Technologies + snmpd from SNMP Research + TokenVIEW + Tricklet + vrfy + XNETMON by SNMP Research + xnetmon by Wellfleet + xnetperfmon + xup + + + + + + 
+ +NOCTools2 Working Group [Page 14] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Traffic + ------- + etherfind + EtherMeter + Ethernet Box II + EtherView + getethers + LAN Patrol + LanProbe + LANVista + LANWatch + ENTM + MONET + NetMetrix Load Monitor + NetMetrix NFS Monitor + NetMetrix Protocol Analyzer + NetMetrix Traffic Generator + NETMON by Mitre + NETscout + netwatch + Network Integrator I + nfswatch + nhfsstone + NNStat + ositrace + PacketView + Sniffer + SpiderMonitor + spray + tcpdump + tcplogger + trpt + ttcp + XNETMON by SNMP Research + + Bridge + ------ + decaddrs by Wellfleet + EMANATE + MIB Manager from Empire Technologies + MONET + proxyd by SNMP Research + SAS/CPE + SNMP Libraries and Utilities from SNMP Research + SNMP Packaged Agent System + snmpd from SNMP Research + XNETMON from SNMP Research + + + + +NOCTools2 Working Group [Page 15] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + CHAOS + ----- + Interactive Network Map + LANWatch + + DECnet + ------ + decaddrs by Wellfleet + LANVista + LANWatch + MONET + net_monitor + NetMetrix Protocol Analyzer + NETMON for Windows + NETscout + Sniffer + SNMP Libraries and Utilities from SNMP Research + SpiderMonitor + XNETMON from SNMP Research + xnetperfmon from SNMP Research + + DNS + --- + DiG + dnsstats + doc + lamers + LANWatch + NetMetrix Protocol Analyzer + NOCOL + + Ethernet + -------- + arp + Beholder + Eagle + EMANATE + etherfind + etherhostprobe + EtherMeter + Ethernet Box II + ethernet-codes + EtherView + getethers + LAN Patrol + LanProbe + LANVista + LANWatch + + + +NOCTools2 Working Group [Page 16] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + ENTM + Interactive Network Map + MONET + NetMetrix Load Monitor + NetMetrix NFS Monitor + NetMetrix Protocol Analyzer + NetMetrix Traffic Generator + NETMON for Windows + NETscout + netwatch + Network Integrator I + nfswatch + NNStat + PacketView + proxyd from SNMP Research + SAS/CPE + Sniffer + 
SNMP Libraries and Utilities from SNMP Research + SNMP Packaged Agent System from SNMP Research + snmpd from SNMP Research + SpiderMonitor + tcpdump + XNETMON from SNMP Research + xnetperfmon from SNMP Research + + FDDI + ---- + EMANATE + ethernet-codes + NetMetrix Load Monitor + NetMetrix NFS Monitor + NetMetrix Protocol Analyzer + NetMetrix Traffic Generator + nfswatch + SAS/CPE + SNMP Libraries and utilities from SNMP Research + SNMP Packaged Agent System from SNMP Research + snmpd from SNMP Research + XNETMON from SNMP Research + + IP + -- + arp + CMU SNMP + Dual Manager + Eagle + EMANATE + etherfind + + + +NOCTools2 Working Group [Page 17] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + etherhostprobe + EtherView + fping + getone from Wellfleet + hammer & anvil + hopcheck + Internet Rover + LanProbe + LANVista + LANWatch + ENTM + Interactive Network Map + MIB Manager from Empire Technologies + MONET + net_monitor + Netlabs CMOT Agent + Netlabs SNMP Agent + NetMetrix Load Monitor + NetMetrix Protocol Analyzer + NetMetrix Traffic Generator + NETMON by Mitre + NETMON for Windows + NETscout + netstat + netwatch + nfswatch + nhfsstone + NNStat + NOCOL + NPRV + OverVIEW + PacketView + ping + ping from UCB + proxyd from SNMP Research + query + SAS/CPE + SNMP Development Kit + SNMP Libraries and Utilities from SNMP Research + SNMP Packaged Agent System from SNMP Research + PSI SNMP + snmpd from Empire Technologies + snmpd from SNMP Research + PSI SNMP + SpiderMonitor + SPIMS + spray + tcpdump + + + +NOCTools2 Working Group [Page 18] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + tcplogger + traceroute + trpt + ttcp + XNETMON from SNMP Research + xnetmon from Wellfleet + xnetperfmon from SNMP Research + + OSI + --- + CMIP Library + Dual Manager + EMANATE + LANVista + LANWatch + Netlabs CMOT Agent + NetMetrix Protocol Analyzer + NETMON for Windows + NETscout + NOCOL + ositrace + proxyd from SNMP Research + SAS/CPE + Sniffer + SNMP 
Libraries and Utilities from SNMP Research + SNMP Packaged Agent System from SNMP Research + snmpd from SNMP Research + SpiderMonitor + SPIMS + XNETMON from SNMP Research + xnetperfmon from SNMP Research + + NFS + --- + etherfind + EtherView + iozone + LADDIS + NetMetrix NFS Monitor + NetMetrix Protocol Analyzer + NETscout + nfswatch + nhfsstone + Sniffer + tcpdump + + + + + + +NOCTools2 Working Group [Page 19] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Ring + ---- + Eagle + EMANATE + Interactive Network Map + LANVista + LANWatch + NetMetrix Load Monitor + NetMetrix NFS Monitor + NetMetrix Protocol Analyzer + NetMetrix Traffic Generator + NETMON by Mitre + NETMON for Windows + NETscout + netwatch + PacketView + proxyd from SNMP Research + Sniffer + SNMP Libraries and Utilities from SNMP Research + SNMP Packaged Agent System from SNMP Research + snmpd from SNMP Research + TokenVIEW + XNETMON from SNMP Research + xnetperfmon from SNMP Research + + SMTP + ---- + host + Internet Rover + LANWatch + mconnect + NetMetrix Protocol Analyzer + Sniffer + vrfy + + Star + ---- + EMANATE + Interactive Network Map + LAN Patrol + LANWatch + NETMON for Windows + NETscout + proxyd from SNMP Research + Sniffer + SNMP Libraries and Utilities from SNMP Research + SNMP Packaged Agent System from SNMP Research + snmpd from SNMP Research + + + +NOCTools2 Working Group [Page 20] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + XNETMON from SNMP Research + xnetperfmon from SNMP Research + + Curses + ------ + Eagle + Internet Rover + net_monitor + nfswatch + NOCOL + PSI SNMP + + Eavesdrop + --------- + etherfind + Ethernet Box II + EtherView + LAN Patrol + LANVista + LANWatch + ENTM + NetMetrix Load Monitor + NetMetrix NFS Monitor + NetMetrix Protocol Analyzer + NetNetrix Traffic Generator + NETMON from Mitre + NETscout + netwatch + nfswatch + NNStat + OSITRACE + PacketView + Sniffer + SpiderMonitor + tcplogger + trpt + + NMS + --- + CMU SNMP + decaddrs 
from Wellfleet + Dual Manager + EMANATE + EtherMeter + Ethernet Box II + getone from Wellfleet + Interactive Network Map + MONET + + + +NOCTools2 Working Group [Page 21] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Netlabs CMOT Agent + Netlabs SNMP Agent + NETMON for Windows + NETscout + NNStat + NOCOL + OverVIEW + proxyd from SNMP Research + SNMP Development Kit + SNMP Libraries and Utilities from SNMP Research + SNMP Packaged Agent System from SNMP Research + PSI SNMP + snmpd from Empire Technologies + snmpd from SNMP Research + TokenVIEW + XNETMON from SNMP Research + xnetmon from Wellfleet + xnetperfmon from SNMP Research + + Ping + ---- + etherhostprobe + fping + getethers + hopcheck + Interactive Network Map + Internet Rover + LANWatch + net_monitor + NOCOL + NPRV + ping + ping from UCB + spray + traceroute + ttcp + XNETMON from SNMP Research + xup + + Proprietary + ----------- + Eagle + EtherMeter + Ethernet Box II + LanProbe + LANVista + TokenVIEW + + + + +NOCTools2 Working Group [Page 22] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + RMON + ---- + Beholder + + SNMP + ---- + Beholder + CMU SNMP + decaddrs from Wellfleet + Dual Manager + EMANATE + getone from Wellfleet + Interactive Network Map + MIB Manager from Empire Technologies + MONET + Netlabs SNMP Agent + NetMetrix Load Monitor + NetMetrix NFS Monitor + NetMetrix Protocol Analyzer + NetMetrix Traffic Generator + NETMON for Windows + NETscout + NOCOL + OverVIEW + proxyd from SNMP Research + SNMP Development Kit + SNMP Libraries and utilities from SNMP Research + SNMP Packaged Agent System from SNMP Research + PSI SNMP + snmpd from Empire Technologies + snmpd from SNMP Research + Wollongong-Manager + XNETMON from SNMP Research + xnetmon from Wellfleet + xnetperfmon from SNMP Research + + Spoof + ----- + DiG + doc + Internet Rover + host + LADDIS + mconnect + nhfsstone + NOCOL + query + SPIMS + + + +NOCTools2 Working Group [Page 23] + +RFC 1470 FYI: Network 
Management Tool Catalog June 1993 + + + vrfy + + X + - + Dual Manager + Interactive Network Map + MIB Manager from Empire Technologies + NetMetrix Load Monitor + NetMetrix NFS Monitor + NetMetrix Protocol Analyzer + NetMetrix Traffic Generator + SAS/CPE + PSI SNMP + XNETMON from SNMP Research + xnetperfmon from SNMP Research + xup + + DEC + --- + Wollongong-Manager + + DOS + --- + Computer Security Checklist + Ethernet Box II + hammer & anvil + hopcheck + iozone + LAN Patrol + LANVista + netmon + NETMON for Windows + netwatch + OverVIEW + PacketView + ping + SAS/CPE + SNMP Libraries and Utilities from SNMP Research + SNMP Packaged Agent System from SNMP Research + snmpd from SNMP Research + TokenVIEW + Wollongong-Manager + xnetperfmon from SNMP Research + + + + + + + + +NOCTools2 Working Group [Page 24] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + HP + -- + iozone + SAS/CPE + xup + + Macintosh + --------- + HyperMIB + + OS/2 + ---- + Beholder + Tricklet + + Standalone + ---------- + LANVista + Sniffer + SNMP Packaged Agent System from SNMP Research + SpiderMonitor + + Sun + --- + Avatar SunSNMPD + Wollongong Manager + + UNIX + ---- + arp + CMIP Library + CMU SNMP + decaddrs from Wellfleet + DiG + doc + dnsstats + Eagle + etherfind + etherhostprobe + EtherView + fping + getethers + getone from Wellfleet + host + Interactive Network Map + Internet Rover + iozone + LADDIS + + + +NOCTools2 Working Group [Page 25] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + lamers + mconnect + MIB Manager from Empire Technologies + MONET + net_monitor + Dual Manager + NetMetrix Load Monitor + NetMetrix NFS Monitor + NetMetrix Protocol Analyzer + NetMetrix Traffic Generator + NETMON from Mitre + NETscout + netstat + Network Integrator I + nfswatch + nhfsstone + NNStat + NOCOL + OSITRACE + ping + ping from UCB + proxyd from SNMP Research + query + SAS/CPE + SNMP Development Kit + SNMP Libraries and Utilities from Empire Technologies + SNMP 
Libraries and Utilities from SNMP Research + SNMP Packaged Agent System from SNMP Research + PSI SNMP + snmpd from Empire Technologies + snmpd from SNMP Research + SPIMS + spray + tcpdump + tcplogger + traceroute + Tricklet + trpt + ttcp + vrfy + XNETMON from SNMP Research + xnetmon from Wellfleet + xnetperfmon from SNMP Research + + VMS + --- + arp + ENTM + + + +NOCTools2 Working Group [Page 26] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + fping + net_monitor + netstat + NPRV + ping + SNMP Libraries and Utilities from SNMP Research + tcpdump + traceroute + ttcp + xnetperfmon from SNMP Research + + Free + ---- + arp + Beholder + CMIP Library + CMU SNMP Distribution + DiG + dnsstats + doc + ENTM + fping + getethers + hammer & anvil + hopcheck + host + Interactive Network Map + Internet Rover + iozone + lamers + net_monitor + netmon from Mitre + netstat + netwatch + nfswatch + nhfsstone + NNStat + NOCOL + NPRV + OSITRACE + PING + ping from UCB + query + SNMP Development Kit + tcpdump + tcplogger + traceroute + Tricklet + + + +NOCTools2 Working Group [Page 27] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + trpt + ttcp + vrfy + + Library + ------- + CMIP Library + CMU SNMP + Dual Manager + NetMetrix Protocol Analyzer + NetMetrix Traffic Generator + proxyd from SNMP Research + SAS/CPE + + Sourcelib + --------- + Beholder + CMIP Library + CMU SNMP + EMANATE + HyperMIB + Interactive Network Map + Internet Rover + LANWatch + MIB Manager from Empire Technologies + net_monitor + NETMON for Windows + NOCOL + proxyd from SNMP Research + SNMP Development Kit + SNMP Libraries and Utilities from Empire Technologies + SNMP Libraries and Utilities from SNMP Research + SNMP Packaged Agent System from SNMP Research + snmpd from SNMP Research + SpiderMonitor + Tricklet + XNETMON from SNMP Research + xnetperfmon from SNMP Research + +Tool Descriptions + + This section is an updated collection of brief descriptions of tools + for managing TCP/IP 
internets. These entries are in alphabetical + order, by tool name. + + The entries all follow a standard format. Immediately after the NAME + of a tool are its associated KEYWORDS. Keywords are terse + descriptions of the purposes or attributes of a tool. A more + + + +NOCTools2 Working Group [Page 28] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + detailed description of a tool's purpose and characteristics is given + in the ABSTRACT section. The MECHANISM section describes how a tool + works. In CAVEATS, warnings about tool use are given. In BUGS, + known bugs or bug-report procedures are given. LIMITATIONS describes + the boundaries of a tool's capabilities. HARDWARE REQUIRED and + SOFTWARE REQUIRED relate the operational environment a tool needs. + Finally, in AVAILABILITY, pointers to vendors, online repositories, + or other sources for a tool are given. + + Where tool names conflict, the vendor name is used as well. For + example, MITRE, and SNMP Research each submitted an updated + description of a tool called, "NETMON". These tools were + independently developed, are functionally different, and run in + different environments. MITRE's tool is listed as "NETMON_MITRE," + and the tool from SNMP Research as "NETMON_WINDOWS_SNMP_RESEARCH". + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 29] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog ARP + + NAME + arp + + KEYWORDS + routing; ethernet, IP;; UNIX, VMS; free. + + ABSTRACT + Arp displays and can modify the internet-to-ethernet + address translations tables used by ARP, the address + resolution protocol. + + MECHANISM + The arp program accesses operating system memory to + read the ARP data structures. + + CAVEATS + None. + + BUGS + None known. + + LIMITATIONS + Only the super user can modify ARP entries. + + HARDWARE REQUIRED + No restrictions. + + SOFTWARE REQUIRED + BSD UNIX or related OS, or VMS. 
+ + AVAILABILITY AND CONTACT POINT FOR INFORMATION ABOUT THIS TOOL + + Available via anonymous FTP from uunet.uu.net, in + directory bsd-sources/src/etc. Available with 4.xBSD + UNIX and related operating systems. For VMS, available + as part of TGV MultiNet IP software package, as well as + Wollongong's WIN/TCP and Process Software Corporation's + TCPware for VMS. + + CONTACT POINT FOR CHANGES TO THIS CATALOG ENTRY + This entry maintained by the NOCtools editors. + Send email to noctools-request@merit.edu. + + + + + + + +NOCTools2 Working Group [Page 30] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog AVATAR-SNMP-TOOLKIT + + NAME + SNMP Application Development Toolkit + + KEYWORDS + manager;;SNMP;;sourcelib. + + ABSTRACT + snmpapi is an api toolkit for developing SNMP + applications and agents. The toolkit is simple, + very fast, and can be used for any type of + application. It is very well suited for embedded + systems such as bridges or routers. An example MIB II + agent for Sun Sparcstations is provided. snmpapi is + distributed in source form only. + + MECHANISM + snmpapi is a library of C functions. + + CAVEATS + None. + + BUGS + None known. + + LIMITATIONS + None. + + HARDWARE REQUIRED + No restrictions. + + AVAILABILITY + Available now. For more information, send e-mail to + info@avatar.com. + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 31] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog AVATAR-SUNSNMPD + + NAME + sunsnmpd + + KEYWORDS + manager;;snmp;sun;. + + ABSTRACT + sunsnmpd is a fully supported SNMP agent with MIB II + support for Sun Sparcstations running SunOS 4.1 or + higher. sunsnmpd supports both SNMP GET and SET + operations. + + MECHANISM + sunsnmpd is a daemon process which starts up at boot + time from the rc.local file. It uses /dev/kmem to access + kernel structures. + + CAVEATS + None. + + BUGS + None known.
+ + LIMITATIONS + Must be started by a super user. + + HARDWARE REQUIRED + Sun Sparcstations. + + AVAILABILITY + Available now. Site licensing only. For more information, + send e-mail to info@avatar.com. + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 32] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog ChameLAN-100 + + NAME + ChameLAN 100 + + KEYWORDS + analyzer, benchmark, debugger, generator, map, + reference, status, traffic; bridge, DECnet, ethernet, + FDDI, IP, OSI, NFS, ring; eavesdrop, SNMP, X; + standalone, UNIX. + + ABSTRACT + + Tekelec's ChameLAN 100 is a portable diagnostic system + for monitoring and simulation of FDDI, Ethernet and + Token Ring networks -- simultaneously. Protocol + analysis of multiple topologies, as well as mixed + topologies simultaneously, is a key feature of + the product family. Tekelec's proprietary FDDI + hardware guarantees complete real-time analysis of + networks and network components at the full ring + bandwidth of 125 Mbps. It passively connects to the + network and captures 100 percent of the data, measures + performance and isolates real-time problems. + + The simulation option offers full bandwidth load + generation that allows you to create and simulate any + network condition. It gives you the ability to inject + errors and misformed frames. A set of + confidence tests allow simple evaluation of new + equipment. A ring map feature displays network + topology and status of all nodes via the SMT + process. + + Monitoring of FDDI, Ethernet and Token Ring allows the + user to: view network status in real time; view + network, node, or node pair statistics; capture + frames; control capture using trigger and filter + capabilities; view real-time statistics; view captured + frames in decoded format; and view the last frame + transmitted by each station.
+ + The following Real-Time Network Statistics of FDDI, + Ethernet and Token Ring networks is displayed: frame + rate, runts, byte rate, jabbers, CRC/align errors, and + collisions. + + Product developers can use the ChameLAN 100 to observe + + + +NOCTools2 Working Group [Page 33] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + and control various events to help debug their FDDI, + Ethernet and Token Ring products. End users can + perform real-time monitoring to test and + diagnose problems that may occur when developing, + installing or managing FDDI, Ethernet and Token Ring + networks and network products. End users can use the + ChameLAN 100 to aid in the installation and + maintenance of Ethernet and Token Ring networks. To + isolate specific network trouble spots the ChameLAN + 100 uses filtering and triggering techniques for data + capture. Higher level protocol decode includes + TCP/IP, OSI and DECnet protocol suites. Protocol + decode of IPX, SNMP, XTP, and AppleTalk are also + supported. Development of additional protocol decodes + is also under development. The ChameLAN 100 family + also offers a Protocol Management Development System + (PMDS) that enables users to develop custom protocol + decode suites. + + The FDDI, Ethernet and Token Ring hardware interfaces + feature independent processing power. Real-time data + is monitored unobtrusively at full bandwidth without + affecting network activity. Real-time data may also + be saved to a 120MB or optional 200MB hard disk drive + for later analysis. FDDI data is captured at 125 megabits + per second (Mbps), Ethernet at 10 Mbps and Token Ring + at 4 or 16 Mbps. + + MECHANISM + This portable, standalone unit incorporates the power + of UNIX, X-Windows and Motif. Its UNIX-based + programming interface facilitates development of + customized monitoring and simulation applications. The + ChameLAN 100 may connect to the network at any + location using standard equipment. 
Standard graphical + Motif/X-Windows and TCP/IP allow remote control + through Ethernet and 10Base T interfaces. Tekelec + also offers a rackmounted model -- ChameLAN 100-X. + Both models can be controlled via a Sun Workstation + remotely. + + CAVEATS + none. + + BUGS + none known. + + + + + +NOCTools2 Working Group [Page 34] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + LIMITATIONS + none reported. + + HARDWARE REQUIRED + None. The ChameLAN 100 is a self-contained unit, and + includes its own interface cards. It installs + into a network with standard interface + connectors. + + SOFTWARE REQUIRED + None. + + AVAILABILITY AND CONTACT POINT FOR INFORMATION ABOUT THIS TOOL + The ChameLAN 100 product family is available + commercially. For more information or a free demo, + call or write: + + 1.800.tek.elec + Tekelec + 26580 West Agoura Road + Calabasas, CA 91302 + Phone: 818.880.5656 + Fax: 818.880.6993 + + The ChameLAN 100 is listed on the GSA schedule. + + CONTACT POINT FOR CHANGES TO THIS CATALOG ENTRY + Todd Koch + Public Relations Specialist + 818.880.7718 + Internet: todd.koch@tekelec.com + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 35] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog CMU_SNMP + + NAME + The CMU SNMP Distribution + + KEYWORDS + manager, status; IP; NMS, SNMP; UNIX; free, sourcelib. + + ABSTRACT + The CMU SNMP Distribution includes source code for an + SNMP agent, several SNMP client applications, an ASN.1 + library, and supporting documentation. + + The agent compiles into about 10 KB of 68000 code. The + distribution includes a full agent that runs on a + Kinetics FastPath2/3/4, and is built into the KIP + appletalk/ethernet gateway. The machine independent + portions of this agent also run on CMU's IBM PC/AT + based router. + + The applications are designed to be useful in the real + world. 
Information is collected and presented in a + useful format and is suitable for everyday status + monitoring. Input and output are interpreted + symbolically. The tools can be used without + referencing the RFCs. + + MECHANISM + SNMP. + + CAVEATS + None. + + BUGS + None reported. Send bug reports to + sw0l+snmp@andrew.cmu.edu. ("sw0l" is "ess double-you + zero ell.") + + LIMITATIONS + None reported. + + HARDWARE REQUIRED + The KIP gateway agent runs on a Kinetics FastPath2/3/4. + Otherwise, no restrictions. + + SOFTWARE REQUIRED + The code was written with efficiency and portability in + mind. The applications compile and run on the follow- + + + +NOCTools2 Working Group [Page 36] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + ing systems: IBM PC/RT running ACIS Release 3, Sun3/50 + running SUNOS 3.5, and the DEC microVax running Ultrix + 2.2. They are expected to run on any system with a + Berkeley socket interface. + + AVAILABILITY + This distribution is copyrighted by CMU, but may be + used and sold without permission. Consult the copy- + right notices for further information. The distribu- + tion is available by anonymous FTP from the host + lancaster.andrew.cmu.edu (128.2.13.21) as the files + pub/cmu-snmp.9.tar, and pub/kip-snmp.9.tar. The former + includes the libraries and the applications, and the + latter is the KIP SNMP agent. + + Please direct questions, comments, and bug reports to + sw0l+snmp@andrew.cmu.edu. ("sw0l" is "ess double-you + zero ell.") If you pick up this package, please send a + note to the above address, so that you may be notified + of future enhancements/changes and additions to the set + of applications (several are planned). + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 37] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog COMPUTER-SECURITY-CHECKLIST + + NAME + Computer Security Checklist + + KEYWORDS + security; DOS. 
+ + ABSTRACT + This program consists of 858 computer security ques- + tions divided up in thirteen sections. The program + presents the questions to the user and records their + responses. After answering the questions in one of the + thirteen sections, the user can generate a report from + the questions and the user's answers. The thirteen + sections are: telecommunications security, physical + access security, personnel security, systems develop- + ment security, security awareness and training prac- + tices, organizational and management security, data and + program security, processing and operations security, + ergonomics and error prevention, environmental secu- + rity, and backup and recovery security. + + The questions are weighted as to their importance, and + the report generator can sort the questions by weight. + This way the most important issues can be tackled + first. + + MECHANISM + The questions are displayed on the screen and the user + is prompted for a single keystroke reply. When the end + of one of the thirteen sections is reached, the answers + are written to a disk file. The question file and the + answer file are merged to create the report file. + + CAVEATS + None. + + BUGS + None known. + + LIMITATIONS + None reported. + + HARDWARE REQUIRED + No restrictions. + + + + + +NOCTools2 Working Group [Page 38] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + SOFTWARE REQUIRED + DOS operating system. + + AVAILABILITY + A commercial product available from: + C.D., Ltd. + P.O. Box 58363 + Seattle, WA 98138 + (206) 243-8700 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 39] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog CMIP-LIBRARY + + NAME + CMIP Library + + KEYWORDS + manager; osi; cmis; unix; free, sourcelib. 
+ + ABSTRACT + + The CMIP Library implements the functionality of the + Common Management Information Service/Protocol as in + the full international standards (ISO 9595, ISO 9596) + published in 1990. It is designed to work with the + ISODE package and can act as a building block for the + construction of CMIP-based agent and manager + applications. + + MECHANISM + The CMIP library uses ISO ROS, ACSE and ASN.1 + presentation, as implemented in ISODE, to provide its + service. + + CAVEATS + None. + + BUGS + None known. + + LIMITATIONS + None known. + + HARDWARE REQUIRED + Has been tested on SUN 3 and SUN 4 architectures. + + SOFTWARE REQUIRED + The ISODE protocol suite, BSD UNIX. + + AVAILABILITY + The CMIP library and related management tools built + upon it, known as OSIMIS (OSI Management Information + Service), are publicly available from University + College London, England via FTP and FTAM. To obtain + information regarding a copy send email to + osimis-request@cs.ucl.ac.uk or call +44 71 380 7366. + + + + + + +NOCTools2 Working Group [Page 40] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog DECADDRS + + NAME + decaddrs, decaroute, decnroute, xnsroutes, bridgetab + + KEYWORDS + manager, map, routing; bridge, DECnet; NMS, SNMP; UNIX. + + ABSTRACT + These commands display private MIB information from + Wellfleet systems. They retrieve and format for + display values of one or several MIB variables from the + Wellfleet Communications private enterprise MIB, using + the SNMP (RFC1098). In particular these tools are used + to examine the non-IP modules (DECnet, XNS, and Bridg- + ing) of a Wellfleet system. + + Decaddrs displays the DECnet configuration of a + Wellfleet system acting as a DECnet router, showing the + static parameters associated with each DECnet inter- + face. Decaroute and decnroute display the DECnet + inter-area and intra-area routing tables (that is area + routes and node routes). 
Xnsroutes displays routes + known to a Wellfleet system acting as an XNS router. + Bridgetab displays the bridge forwarding table with the + disposition of traffic arriving from or directed to + each station known to the Wellfleet bridge module. All + these commands take an IP address as the argument and + can specify an SNMP community for the retrieval. One + SNMP query is performed for each row of the table. + Note that the Wellfleet system must be operating as an + IP router for the SNMP to be accessible. + + MECHANISM + Management information is exchanged by use of SNMP. + + CAVEATS + None. + + BUGS + None known. + + LIMITATIONS + None reported. + + HARDWARE REQUIRED + Distributed and supported for Sun 3 systems. + + + + +NOCTools2 Working Group [Page 41] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + SOFTWARE REQUIRED + Distributed and supported for SunOS 3.5 and 4.x. + + AVAILABILITY + Commercial product of: + Wellfleet Communications, Inc. + 12 DeAngelo Drive + Bedford, MA 01730-2204 + (617) 275-2400 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 42] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog DIG + + NAME + DiG + + KEYWORDS + status; DNS; spoof; UNIX; free. + + ABSTRACT + DiG (domain information groper), is a command line tool + which queries DNS servers in either an interactive or a + batch mode. It was developed to be more + convenient/flexible than nslookup for gathering perfor- + mance data and testing DNS servers. + + MECHANISM + Dig is built on a slightly modified version of the bind + resolver (release 4.8). + + CAVEATS + none. + + BUGS + None known. + + LIMITATIONS + None reported. + + HARDWARE REQUIRED + No restrictions. + + SOFTWARE REQUIRED + BSD UNIX. + + AVAILABILITY + DiG is available via anonymous FTP from venera.isi.edu + in pub/dig.2.0.tar.Z. 
+ + + + + + + + + + + + + + +NOCTools2 Working Group [Page 43] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog EMANATE_SNMP_RESEARCH + + NAME + EMANATE: Enhanced MANagement Agent Through Extensions + from SNMP Research. + + KEYWORDS + alarm, control, manager, reference, security, status; + bridge, Ethernet, FDDI, IP, OSI, ring, star; + NMS, SNMP; + sourcelib. + + ABSTRACT + The EMANATE system provides a run-time extensible SNMP + agent that dynamically reconfigures an agent's MIB + without having to recompile, relink, or restart the + agent. An EMANATE capable SNMP agent can support zero, + one, or many subagents and dynamically reconfigure to + connect or disconnect those subagents' MIBs. + + The EMANATE system consists of several logically + independent components and subsystems: + + o Master SNMP agent which contains an API to communicate + with subagents. + o Subagents which implement various MIBS. + o Subagent Developer's Kit which contains tools to assist + in the implementation of subagents. + o EMANATE libraries which provide the API for the + subagent. + + MECHANISM + A concise API allows a standard means of communication + between the master and subagents. System dependent + mechanisms are employed for transfer of information + between the master and subagents. + + CAVEATS + None. + + BUGS + None known. + + LIMITATIONS + None reported. + + + + + + +NOCTools2 Working Group [Page 44] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + HARDWARE REQUIRED + Multiple platforms including PC's, workstations, hosts, + and servers are supported. Contact SNMP Research for + more details. + + SOFTWARE REQUIRED + C compiler. 
+ + AVAILABILITY AND CONTACT POINT FOR INFORMATION ABOUT THIS TOOL + This is a commercial product available under license + from: + + SNMP Research + 3001 Kimberlin Heights Road + Knoxville, TN 37920-9716 + Attn: John Southwood, Sales and Marketing + (615) 573-1434 (Voice) (615) 573-9197 (FAX) + + CONTACT POINT FOR CHANGES TO THIS CATALOG ENTRY + users@seymour1.cs.utk.edu + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 45] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog ETHERFIND_SUN + + NAME + etherfind + + KEYWORDS + traffic; ethernet, IP, NFS; eavesdrop; UNIX. + + ABSTRACT + Etherfind examines the packets that traverse a network + interface, and outputs a text file describing the + traffic. In the file, a single line of text describes + a single packet: it contains values such as protocol + type, length, source, and destination. Etherfind can + print out all packet traffic on the ethernet, or + traffic for the local host. Further packet filtering + can be done on the basis of protocol: IP, ARP, RARP, + ICMP, UDP, ND, TCP, and filtering can also be done + based on the source, destination addresses as well as + TCP and UDP port numbers. + + MECHANISM + In usual operations, and by default, etherfind puts the + interface in promiscuous mode. In 4.3BSD UNIX and + related OSs, it uses a Network Interface Tap (NIT) to + obtain a copy of traffic on an ethernet interface. + + CAVEATS + None. + + BUGS + None known. + + LIMITATIONS + Minimal protocol information is printed. Can only be + run by the super user. The syntax is painful. + + HARDWARE REQUIRED + Ethernet. + + SOFTWARE REQUIRED + SunOS. + + AVAILABILITY + Executable included in Sun OS "Networking Tools and + Programs" software installation option. 
+ + + + + +NOCTools2 Working Group [Page 46] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog ETHERNET-CODES + + NAME + ethernet-codes + + KEYWORDS + reference; + ethernet, fddi; + ; + ; + ; + + ABSTRACT + Mike Patton of MIT LCS has compiled a very + comprehensive list of the IEEE numbers used on + Ethernet and FDDI (with some permutation). + This file contains collected information on the + various codes used on IEEE 802.3 and EtherNet. + There are three "pages": type codes, vendor + codes, and the uses of multicast (including + broadcast) addresses. + + MECHANISM + FTP the file and use it like a secret decoder ring. + + CAVEATS + Since this information is from collected wisdom, + there are certainly omissions. + + BUGS + Mike welcomes any further additions. + They can be sent to a special mailbox that he has set up: + + MAP=EtherNet-codes@LCS.MIT.Edu + + LIMITATIONS + See caveats. + + HARDWARE REQUIRED + No restrictions. + + SOFTWARE REQUIRED + No restrictions. + + + + + + + + +NOCTools2 Working Group [Page 47] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + AVAILABILITY AND CONTACT POINT FOR INFORMATION ABOUT THIS TOOL + The file is stored as flat, non-compressed ASCII text. + It can be FTP'ed from: + ftp.lcs.mit.edu + + Retrieve the file: + /pub/map/EtherNet-codes + + To submit additions or obtain further assistance, send email to: + MAP=EtherNet-codes@LCS.MIT.Edu + + CONTACT POINT FOR CHANGES TO THIS CATALOG ENTRY + This entry is maintained by the NOCtools editors. 
+ Send email to noctools-request@merit.edu + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 48] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog GENERIC-MANAGED-SYSTEM + + NAME + Generic Managed System + + KEYWORDS + manager; osi; cmis; unix; free, sourcelib + + ABSTRACT + The Generic Managed System (GMS) implements the + functions that would be common to any OSI managed + system. These include the parsing of CMIS requests, + selection of managed objects according to the scoping + and filtering rules, handling of notifications and + event forwarding discriminators etc. The intention is + that the implementors should use the GMS as a basis + for their own managed object implementations. A + support environment is provided to assist with this. + + MECHANISM + The GMS uses the UCL CMIP library plus a library of + C++ objects representing common managed objects and + attribute types. + + CAVEATS + The system is still experimental, is subject to change + and is not yet well documented. + + BUGS + See above. + + LIMITATIONS + None known. + + HARDWARE REQUIRED + Has been tested on SUN 3 and SUN 4 architectures. + + SOFTWARE REQUIRED + The ISODE protocol suite, BSD UNIX, UCL CMIP Library, + GNU C++ (g++). + + AVAILABILITY + The CMIP library and related management tools built + upon it, known as OSIMIS (OSI Management Information + Service), are publicly available from University + College London, England via FTP and FTAM. To obtain + information regarding a copy send email to + osimis-request@cs.ucl.ac.uk or call +44 71 380 7366. 
+ + + +NOCTools2 Working Group [Page 49] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog GETETHERS + + NAME + getethers + + KEYWORDS + Traffic; Ethernet; Ping; UNIX; Free + + ABSTRACT + Getethers runs through all addresses on an ethernet + segment (a.b.c.1 to a.b.c.254) and pings each address, + and then determines the ethernet address for that + host. It produces a list, in either plain ASCII, the + file format for the Excelan Lanalyzer, or the file + format for the Network General Sniffer, of + hostname/ethernet address pairs for all hosts on the + local network. The plain ASCII list optionally + includes the vendor name of the ethernet card in + each system, to aid in the determination of the + identity of unknown systems. + + MECHANISM + Getethers uses a raw IP socket to generate ICMP echo + requests and receive ICMP echo replies, and then + examines the kernel ARP table to determine the + ethernet address of each responding system. + + CAVEATS + Assumes that the ethernet it is looking at is either + a Class C IP network, or part of a Class B IP network + that is subnetted with a netmask of 255.255.255.0. + (This is easy to change, but it's compiled in.) + + BUGS + None known. + + LIMITATIONS + None. + + HARDWARE REQUIRED + Has been tested on Sun-3 and Sun-4 (SPARC) systems + under SunOS 4.1.x, DEC VAXes under 4.3BSD. + + SOFTWARE REQUIRED + Runs under SunOS 4.x and 4.3BSD; should be easy to + port to any other Berkeley-like system. Requires + raw sockets and the ioctl calls to get at the ARP + table. + + + +NOCTools2 Working Group [Page 50] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + AVAILABILITY AND CONTACT POINT FOR INFORMATION ABOUT THIS TOOL + Public domain, and freely distributable. Available + via anonymous FTP from harbor.ecn.purdue.edu; also has + been posted to comp.sources.unix. The current version + is Version 1.4 from May 1992. 
+ + Contact point: + Dave Curry + Purdue University + Engineering Computer Network + 1285 Electrical Engineering Bldg. + West Lafayette, IN 47907-1285 + davy@ecn.purdue.edu + + CONTACT POINT FOR CHANGES TO THIS CATALOG ENTRY + Dave Curry (see address above). + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 51] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog GETONE_WELLFLEET + + NAME + getone, getmany, getroute, getarp, getaddr, getif, + getid. + + KEYWORDS + manager, routing, status; IP; NMS, SNMP; UNIX. + + ABSTRACT + These commands retrieve and format for display values + of one or several MIB variables (RFC1066) using the + SNMP (RFC1098). Getone and getmany retrieve arbitrary + MIB variables; getroute, getarp, getaddr, and getif + retrieve and display tabular information (routing + tables, ARP table, interface configuration, etc.), and + getid retrieves and displays system name, identifica- + tion and boot time. + + Getone retrieves and displays + the value of the designated MIB variable from the + specified target system. The SNMP community name to be + used for the retrieval can also be specified. Getmany + works similarly for groups of MIB variables rather than + individual values. The name of each variable, its + value and its data type is displayed. Getroute returns + information from the ipRoutingTable MIB structure, + displaying the retrieved information in an accessible + format. Getarp behaves similarly for the address + translation table; getaddr for the ipAddressTable; and + getif displays information from the interfaces table, + supplemented with information from the ipAddressTable. + Getid displays the system name, identification, ipFor- + warding state, and the boot time and date. All take a + system name or IP address as an argument and can + specify an SNMP community for the retrieval. One SNMP + query is performed for each row of the table. 
+ + MECHANISM + Queries SNMP agent(s). + + CAVEATS + None. + + BUGS + None known. + + + + + +NOCTools2 Working Group [Page 52] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + LIMITATIONS + None reported. + + HARDWARE REQUIRED + Distributed and supported for Sun 3 systems. + + SOFTWARE REQUIRED + Distributed and supported for SunOS 3.5 and 4.x. + + AVAILABILITY + Commercial product of: + Wellfleet Communications, Inc. + 12 DeAngelo Drive + Bedford, MA 01730-2204 + (617) 275-2400 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 53] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog HAMMER_ANVIL + + NAME + hammer & anvil + + KEYWORDS + benchmark, generator; IP; DOS; free. + + ABSTRACT + Hammer and Anvil are the benchmarking programs for IP + routers. Using these tools, gateways have been tested + for per-packet delay, router-generated traffic over- + head, maximum sustained throughput, etc. + + MECHANISM + Tests are performed on a gateway in an isolated + testbed. Hammer generates packets at controlled rates. + It can set the length and interpacket interval of a + packet stream. Anvil counts packet arrivals. + + CAVEATS + Hammer should not be run on a live network. + + BUGS + None reported. + + LIMITATIONS + Early versions of hammer could not produce inter-packet + intervals shorter than 55 usec. + + HARDWARE REQUIRED + Hammer runs on a PC/AT or compatible, and anvil + requires a PC or clone. Both use a Micom Interlan + NI5210 for LAN interface. + + SOFTWARE REQUIRED + MS-DOS. + + AVAILABILITY + Hammer and anvil are copyrighted, though free. Copies + are available from pub/eutil on husc6.harvard.edu. + + + + + + + + + + +NOCTools2 Working Group [Page 54] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog HOPCHECK + + NAME + hopcheck + + KEYWORDS + routing; IP; ping; DOS; free. 
+ + ABSTRACT + Hopcheck is a tool that lists the gateways traversed by + packets sent from the hopcheck-resident PC to a desti- + nation. Hopcheck uses the same mechanism as traceroute + but is for use on IBM PC compatibles that have ethernet + connections. Hopcheck is part of a larger TCP/IP pack- + age that is known as ka9q that is for use with packet + radio. Ka9q can coexist on a PC with other TCP/IP + packages such as FTP Inc's PC/TCP, but must be used + independently of other packages. Ka9q was written by + Phil Karn. Hopcheck was added by Katie Stevens, + dkstevens@ucdavis.edu. Unlike traceroute, which + requires a UNIX kernel mod, hopcheck will run on the + standard, unmodified ka9q release. + + MECHANISM + See the description in traceroute. + + CAVEATS + See the description in traceroute. + + BUGS + None known. + + HARDWARE REQUIRED + IBM PC compatible with ethernet network interface card; + ethernet card supported through FTP spec packet driver. + + SOFTWARE REQUIRED + DOS. + + AVAILABILITY + Free for radio amateurs and educational institutions; + others should contact Phil Karn, karn@ka9q.bellcore.com. + Available via anonymous FTP at ucdavis.edu, in the + directory "dist/nethop". + + + + + + + +NOCTools2 Working Group [Page 55] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog INTERNET_ROVER + + NAME + Internet Rover + + KEYWORDS + status; IP, SMTP; curses, ping, spoof; UNIX; free, + sourcelib. + + ABSTRACT + Internet Rover is a prototype network monitor that uses + multiple protocol "modules" to test network functional- + ity. This package consists of two primary pieces of + code: the data collector and the problem display. + + There is one data collector that performs a series of + network tests, and maintains a list of problems with + the network. There can be many display processes all + displaying the current list of problems which is useful + in a multi-operator NOC. 
+ + The display task uses curses, allowing many terminal + types to display the problem file either locally or + from a remote site. Full source is provided. The data + collector is easily configured and extensible. Contri- + butions such as additional protocol modules, and shell + script extensions are welcome. + + MECHANISM + A configuration file contains a list of nodes, + addresses, NodeUp? protocol test (ping in most cases), + and a list of further tests to be performed if the node + is in fact up. Modules are included to test TELNET, + FTP, and SMTP. If the configuration contains a test + that isn't recognized, a generic test is assumed, and a + filename is checked for existence. This way users can + create scripts that create a file if there is a prob- + lem, and the data collector simply checks the existence + of that file to determine if there is problem. + + CAVEATS + None. + + BUGS + None known. + + + + + + +NOCTools2 Working Group [Page 56] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + LIMITATIONS + This tool does not yet have the capability to perform + actions based on the result of the test. Rather, it is + intended for a multi-operator environment, and simply + displays a list of what is wrong with the net. + + HARDWARE REQUIRED + This software is known to run on Suns and IBM RTs. + + SOFTWARE REQUIRED + Curses, 4.xBSD UNIX socket programming libraries, BSD + ping. + + AVAILABILITY + Full source available via anonymous FTP from merit.edu + (35.1.1.42) in the ~ftp/pub/inetrover directory. + Source and executables are public domain and can be + freely distributed for non-commercial use. This pack- + age is unsupported, but bug reports and fixes may be + sent to: wbn@merit.edu. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 57] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog IOZONE + + NAME + iozone + + KEYWORDS + benchmark; nfs;; dos,hp,unix,vmx; free. + + ABSTRACT + Software to assess the sequential file I/O capability + of a system. May be useful as reference to compare + against results obtained when files are accessed via + NFS, Andrew, etc. + + MECHANISM + This test writes a X MEGABYTE sequential file in Y + byte chunks, then rewinds it and reads it back. + [The size of the file should be big enough to factor + out the effect of any disk cache.]. Finally, + IOZONE deletes the temporary file. Options allow one to + vary X and Y. In addition, 'auto test' runs IOZONE + repeatedly using record sizes from 512 to 8192 bytes + (adjustable), and file sizes from 1 to 16 megabytes + (adjustable). It creates a table of results. + + CAVEATS + The file is written (filling any cache buffers), and + then read. If the cache is >= X MB, then most if not + all the reads will be satisfied from the cache. + However, if it is less than or equal to + .5X MB, then NONE of the reads will be satisfied from + the cache. This is because after the file is written, + a .5X MB cache will contain the upper .5 MB of the + test file, but we will start reading + from the beginning of the file (data which is no + longer in the cache). + + In order for this to be a fair test, the length of the + test file must be AT LEAST 2X the amount of disk cache + memory for your system. If not, you are really + testing the speed at which your CPU + can read blocks out of the cache (not a fair test). + + BUGS + none known at this time. + + + + + +NOCTools2 Working Group [Page 58] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + LIMITATIONS + IOZONE does not normally test the raw I/O speed of + your disk or system. It tests the speed of + sequential I/O to actual files. 
+ Therefore, this measurement factors in the efficiency + of your machine's file system, operating system, C + compiler, and C runtime library. It produces a + measurement which is the number of bytes + per second that your system can read or write to a file. + + HARDWARE REQUIRED + + This program has been ported and tested on the + following computer operating systems: + +Vendor Operating System Notes on compiling IOzone +----------------------------------------------------------------------- +Apollo Domain/OS no cc switches -- BSD domain +AT&T UNIX System V R4 +AT&T 6386WGS AT&T UNIX 5.3.2 define SYSTYPE_SYSV +Generic AT&T UNIX System V R3 may need cc -DSVR3 +Convergent Unisys/AT&T SVR3 cc -DCONVERGENT -o iozone iozone.c +Digital Equipment ULTRIX V4.1 +Digital Equipment VAX/VMS V5.4 see below ** +Digital Equipment VAX/VMS (POSIX) +Hewlett-Packard HP-UX 7.05 +IBM AIX Ver. 3 rel. 1 +Interactive UNIX System V R3 +Microsoft MS-DOS 3.3 tested Borland, Microsoft C +MIPS RISCos 4.52 +NeXt NeXt OS 2.x +OSF OSF/1 +Portable! POSIX 1003.1-1988 may need to define _POSIX_SOURCE +QNX QNX 4.0 +SCO UNIX System V/386 3.2.2 +SCO XENIX 2.3 +SCO XENIX 3.2 +Silicon Graphics UNIX cc -DSGI -o iozone iozone.c +Sony Microsystems UNIX same as MIPS +Sun Microsystems SUNOS 4.1.1 +Tandem Computers GUARDIAN 90 1. call the source file IOZONEC + 2. C/IN IOZONEC/IOZONE;RUNNABLE + 3. RUN IOZONE +Tandem Computers Non-Stop UX + +** for VMS, define iozone as a foreign command via this DCL command: + + $IOZONE :== $SYS$DISK:[]IOZONE.EXE + + + +NOCTools2 Working Group [Page 59] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + this lets you pass the command line arguments to IOZONE + + SOFTWARE REQUIRED + OS as shown in the hardware listing above. + + AVAILABILITY AND CONTACT POINT FOR INFORMATION ABOUT THIS TOOL + Author: Bill Norcott + 1060 Hyde Avenue + San Jose, CA 95129 + norcott_bill@tandem.com + + Availability: + This tool has been posted to comp.sources.misc. 
+ It is available from the usual archive sites. + Program can be located using ARCHIE or other + servers. + + CONTACT POINT FOR CHANGES TO THIS CATALOG ENTRY + This entry is maintained by the noctools editors. + Send email to noctools-request@merit.edu. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 60] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog LADDIS + + NAME + LADDIS + + KEYWORDS + benchmark, generator; + NFS; + spoof; + unix; + free. + + ABSTRACT + + "LADDIS: A Multi-Vendor and Vendor-Neutral SPEC NFS + Benchmark", Bruce Nelson, LADDIS Group & Auspex Systems. + + Over the past 24 months, engineers from Legato, + Auspex, Data General, DEC, Interphase, and Sun + (LADDIS) met regularly to create the LADDIS NFS + benchmark: an unbiased, standard, vendor-independent, + scalable NFS performance test. + + The purpose of the LADDIS benchmark is to give users a + credible and undisputed test of NFS performance, and + to give vendors a publishable standard performance + measure that customers can use for load planning, + system configuration, and equipment buying decisions. + Toward this end, the LADDIS benchmark is being adopted + by SPEC (the System Performance Evaluation + Cooperative, creators of SPECmarks) as the first + member of SPEC's System-level File Server (SFS) + benchmark suite." + + "In particular, we have had unexpected interest from + some router vendors in using LADDIS to both rate and + stress-test IP routers. This is because LADDIS can + send back-to-back full-size packet trains, and because + it can generate a 90%-Ethernet util on simulated + "real" NFS workloads, just like routers encounter in + the real world. But LADDIS is for local Ethernet or + FDDI nets only, not WAN." + + MECHANISM + Generates NFS requests and measures responsiveness of + the server. 
+ + + + + +NOCTools2 Working Group [Page 61] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + CAVEATS + "LADDIS is not released yet by SPEC, although a free + beta version, quite stable, is available now as + PRE-LADDIS. So you might want to put PRE-LADDIS in + your listing, noting that full LADDIS + availability from SPEC is expected by the end of 1992." + + BUGS + The licensee is requested to direct beta test comments + via electronic mail to: + "spec-preladdis-comments@riscee.pko.dec.com". + + This alias will forward all comments to the SPECSFS + mailing list (which includes the LADDIS Group). + + LIMITATIONS + LADDIS is for local Ethernet or FDDI nets only, not + WAN. + + HARDWARE REQUIRED + A host with LAN connectivity. Presumably, a host with + enough horsepower to generate an adequate work load. + + SOFTWARE REQUIRED + LADDIS is a sophisticated Unix-based NFS traffic + generator program. + + AVAILABILITY AND CONTACT POINT FOR INFORMATION ABOUT THIS TOOL + Date: Mon, 10 Feb 92 13:12:20 PST + From: bnelson (Bruce Nelson) + + Dear Person: + + The SPEC PRE-LADDIS beta test process became + operational on Monday, February 3, 1992. This email + describes the process as announced during the LADDIS + Group's presentation at UniForum '92 and + also at Interop '91. The content of the beta test + license and the license request process are consistent + with the proposals approved by the SPEC Steering + Committee at the January 1992 meeting in Milpitas, + California. + + The SPEC PRE-LADDIS beta test will consist of one beta + test version of PRE-LADDIS distributed ONLY by + electronic mail. The SPEC PRE-LADDIS Beta test + software is licensed by SPEC, not by the LADDIS + Group. + + + +NOCTools2 Working Group [Page 62] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + To obtain the PRE-LADDIS Beta test software, an + individual must: + + 1. 
Request the SPEC PRE-LADDIS beta test License by + electronic mail to + "spec-preladdis-beta-test@riscee.pko.dec.com" with a + subject line of "Request SPEC PRE-LADDIS Beta Test + License". + 2. Print a hardcopy of the license and sign. + 3. Attach a cover letter written on the individual's + company letterhead requesting the PRE-LADDIS Beta + Test Kit. + 4. U.S. Mail the signed license and cover letter to: + SPEC PRE-LADDIS Beta Test + c/o NCGA, 2722 Merrilee Drive, Suite 200 + Fairfax, VA 22031 + + After completing these steps, the SPEC PRE-LADDIS beta + test kit will be emailed to the requestor from + riscee.pko.dec.com. The licensee is requested to + direct beta test comments via electronic mail + to "spec-preladdis-comments@riscee.pko.dec.com". This + alias will forward all comments to the SPECSFS mailing + list (which includes the + LADDIS Group). + + Note that PRE-LADDIS is ONLY available through + electronic mail and ONLY through the process listed + above in steps 1-4. If you do not have internet email + available to you (which is unlikely if you are + receiving THIS email), you must arrange delivery of + PRE-LADDIS through some email-capable part of your + organization, not through LADDIS members like Auspex, + DEC, Sun, etc. + + CONTACT POINT FOR CHANGES TO THIS CATALOG ENTRY + This entry is maintained by the NOCtools editors. + Send E-mail to noctools-request@merit.edu. + + + + + + + + + + + + + +NOCTools2 Working Group [Page 63] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog LAN_PATROL + + NAME + LAN Patrol + + KEYWORDS + security, traffic; ethernet, star; eavesdrop; DOS. + + ABSTRACT + LAN Patrol is a full-featured network analyzer that + provides essential information for effective fault and + performance management. It allows network managers to + easily monitor user activity, find traffic overloads, + plan for growth, test cable, uncover intruders, balance + network services, and so on. 
LAN Patrol uses state of + the art data collection techniques to monitor all + activity on a network, giving an accurate picture of + how it is performing. + + LAN Patrol's reports can be saved as ASCII files to + disk, and imported into spreadsheet or database pro- + grams for further analysis. + + MECHANISM + The LAN Patrol interface driver programs a standard + interface card to capture all traffic on a network seg- + ment. The driver operates from the background of a + standard PC, maintaining statistics for each station on + the network. The information can be viewed on the PC's + screen, or as a user-defined report output either to + file or printer. + + CAVEATS + None. Normal operation is completely passive, making + LAN Patrol transparent to the network. + + BUGS + None known. + + LIMITATIONS + LAN Patrol can monitor up to 10,000 packets/sec on an + AT class PC, and is limited to monitoring a maximum of + 1024 stations for intervals of up to 30 days. + + Because LAN Patrol operates at the physical level, it + will only see traffic for the segment on which it is + installed; it cannot see traffic across bridges. + + + + +NOCTools2 Working Group [Page 64] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + HARDWARE REQUIRED + Computer: IBM PC/XT/AT, PS/2 Model 30, or compatible. + Requires 512K memory and a hard drive or double-sided + disk drive. + + Display: Color or monochrome text. Color display + allows color-coding of traffic information. + + Ethernet, StarLAN, LattisNet, or StarLAN 10 network + interface card. + + SOFTWARE REQUIRED + PC DOS, MS-DOS version 3.1 or greater. + + AVAILABILITY + LAN Patrol may be purchased through network dealers, + or directly from: + Legend Software, Inc.
+ Phone: (201) 227-8771 + FAX: (201) 906-1151 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 65] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog LANVista + + NAME + LANVista + + KEYWORDS + analyzer, benchmark, debugger, generator, manager, traffic; + DECnet, Ethernet, IP, OSI, Ring; Eavesdrop, Proprietary; + DOS, Standalone. + + ABSTRACT + CXR/Digilog's LANVista family of protocol and statistical + analyzers provide the tools to troubleshoot an Ethernet and + Token Ring 4/16Mbps network. LANVista lets you capture + frames to RAM and or disk, generate traffic for stress + testing, test your network cable for fault isolation, and + decode all 7 layers of many popular protocol stacks. + LANVista's 100 family offers exceptional price/performance + and a wide range of options. Combined with an + integrated upgrade path to the fully distributed LANVista + 200 system, the 100 line provides a reasonably priced + entry into LAN management and protocol analysis. + + All LANVista models are fully operable under Microsoft + Windows. Under Windows, LANVista can be operated in + the background, gathering data and alarms as other + tasks are completed. Displayed data may easily be + cut from LANVista and pasted into other Windows + applications such as Excel, Lotus 1-2-3, Harvard + Graphics, etc. + + The versatile LANVista family can also be remotely + controlled through the use of PC Anywhere, Commute, + Carbon Copy, or other PC remote control packages. + This feature allows the use of "co-pilot" mode which + enables an operator at the central site to guide and + train a remote operator through network management or + analysis tasks. + + All LANVista models provide features vital to effective + network management and troubleshooting. 
Basic + capabilities include: Network database, statistics + based on the entire network and on a node basis, Token + Ring functional address statistics, Bridged traffic + statistics, Protocol statistics, logging of statistics + to a printer or file of user definable alarms, Hardware + Pre-Capture filtering, Post capture filtering, Playback of + captured data, Traffic simulation and On-line context + + + +NOCTools2 Working Group [Page 66] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + sensitive Help. + + Protocol Interpreters used for decoding network traffic + supported by LANVista include: TCP/IP, DECnet, Banyan + Vines, XNS/MS-Net, AppleTalk, IBM Token Ring, Novell, + 3Com 3+ Open, SNMP and OSI. + + MECHANISM + LANVista is available in three forms. A kit version which + consists of a plug-in PC card and Master software, a self + contained unit that packages the kit version in a portable + PC, and a Distributed system. The LANVista distributed + system allows slave units placed anywhere in the world to + be controlled from a single central location for + centralized management of an enterprise network. + LANVista's PC cards provides a physical interface to + the LAN and frame preprocessing power. The Master + software controls the PC card, and the display and + processing of information gathered from the network. + + CAVEATS + Optimal performance of LANVista's master software is achieved + with DOS 5.0 by utilizing RAMDRIVE.SYS, SMARTDRV.SYS and High + memory. + + BUGS + None Known. + + LIMITATIONS + None Known. + + HARDWARE REQUIRED + IBM PC AT, 386, 486 or compatible. + + SOFTWARE REQUIRED + DOS + + AVAILABILITY AND CONTACT POINT FOR INFORMATION ABOUT THIS TOOL + LANVista is available worldwide. For information on a + local sales representative contact: + + CXR/DIGILOG + 900 Business Center Drive + Horsham, PA 19044 + Phone 1-800-DIGILOG + FAX: 215-956-0108 + + GSA schedule pricing is honored. 
+ + + +NOCTools2 Working Group [Page 67] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + CONTACT POINT FOR CHANGES TO THIS CATALOG ENTRY + CXR/DIGILOG Help Desk 1-800-DIGILOG + Send email to: lanvista@digilog.uucp + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 68] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog LANPROBE + + NAME + LanProbe -- the HP 4990S LanProbe Distributed Analysis + System. + + KEYWORDS + alarm, manager, map, status, traffic; ethernet; eaves- + drop, NMS; proprietary. + + ABSTRACT + The LanProbe distributed monitoring system performs + remote and local monitoring of ethernet LANs in a pro- + tocol and vendor independent manner. + + LanProbe discovers each active node on a segment and + displays it on a map with its adapter card vendor name, + ethernet address, and IP address. Additional informa- + tion about the nodes, such as equipment type and physi- + cal location can be entered in to the data base by the + user. + + When the NodeLocator option is used, data on the actual + location of nodes is automatically entered and the map + becomes an accurate representation of the physical lay- + out of the segment. Thereafter when a new node is + installed and becomes active, or when a node is moved + or becomes inactive, the change is detected and shown + on the map in real time. The system also provides the + network manager with precise cable fault information + displayed on the map. + + Traffic statistics are gathered and displayed and can + be exported in (comma delimited) CSV format for further + analysis. Alerts can be set on user defined thres- + holds. + + Trace provides a remote protocol analyzer capability + with decodes for common protocols. + + Significant events (like power failure, cable breaks, + new node on network, broadcast IP source address seen, + etc.) 
are tracked in a log that is uploaded to Pro- + beView periodically. + + ProbeView generates reports that can be manipulated by + MSDOS based word processors, spreadsheets, and DBMS. + + + + +NOCTools2 Working Group [Page 69] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + MECHANISM + The system consists of one or more LanProbe segment + monitors and ProbeView software running under Microsoft + Windows. The LanProbe segment monitor attaches to the + end of an ethernet segment and monitors all traffic. + Attachment can be direct to a thin or thick coax cable, + or via an external transceiver to fiber optic or twist- + ed pair cabling. Network data relating to the segment + is transferred to a workstation running ProbeView via + RS-232, ethernet, or a modem connection. + + ProbeView software, which runs on a PC/AT class works- + tation, presents network information in graphical + displays. + + The HP4992A NodeLocator option attaches to the opposite + end of the cable from the HP4991A LanProbe segment mon- + itor. It automatically locates the position of nodes + on the ethernet networks using coaxial cabling schemes. + + CAVEATS + None. + + BUGS + None known. + + LIMITATIONS + None reported. + + HARDWARE REQUIRED + HP 4991A LanProbe segment monitor + HP 4992A NodeLocator (for optional capabilities) + 80386 based PC capable of running MS-Windows + + SOFTWARE REQUIRED + HP 4990A ProbeView + MSDOS 3.0 or higher and Microsoft Windows/286 2.1. + + AVAILABILITY + A commercial product available from: + Hewlett-Packard Company + P.O. Box 10301, + Palo Alto, CA 94303-0890 + + + + + + + + +NOCTools2 Working Group [Page 70] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog LANWATCH + + NAME + LANWatch + + KEYWORDS + alarm, analyzer, traffic; CHAOS, DECnet, DNS, ethernet, + IP, OSI, ring, SMTP, star; eavesdrop; DOS; library, + sourcelib. 
+ + ABSTRACT + LANWatch 2.0 is an inexpensive, powerful and flexible + network analyzer that runs under DOS on personal com- + puters and requires no hardware modifications to either + the host or the network. LANWatch is an invaluable + tool for installing, troubleshooting, and monitoring + local area networks, and for developing and debugging + new protocols. Network managers using LANWatch can + inspect network traffic patterns and packet errors to + isolate performance problems and bottlenecks. Protocol + developers can use LANWatch to inspect and verify + proper protocol handling. Since LANWatch is a + software-only package which installs easily in existing + PCs, network technicians and field service engineers + can carry LANWatch in their briefcase for convenient + network analysis at remote sites. + + LANWatch has two operating modes: Display and Examine. + In Display Mode, LANWatch traces network traffic by + displaying captured packets in real time. Examine Mode + allows you to scroll back through stored packets to + inspect them in detail. To select a subset of packets + for display, storage or retrieval, there is an exten- + sive set of built-in filters. Using filters, LANWatch + collects only packets of interest, saving the user from + having to sort through all network traffic to isolate + specific packets. The built-in filters include alarm, + trigger, capture, load, save and search. They can be + controlled separately to match on source or destination + address, protocol, or packet contents at the hardware + and transport layers. LANWatch also includes suffi- + cient source code so users can modify the existing + filters and parsers or add new ones. + + The LANWatch distribution includes executables and + source for several post-processors: a TCP protocol + analyzer, a node-by-node traffic analyzer and a dump + file listing tool. 
+ + + +NOCTools2 Working Group [Page 71] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + MECHANISM + Uses many common PC network interfaces by placing them + in promiscuous mode and capturing traffic. + + CAVEATS + Most PC network interfaces will not capture 100% of the + traffic on a fully-loaded network (primarily missing + back-to-back packets). + + BUGS + None known. + + LIMITATIONS + LANWatch can't analyze what it doesn't see (see + Caveats). + + HARDWARE REQUIRED + LANWatch requires a PC or PS/2 with a supported network + interface card. + + SOFTWARE REQUIRED + LANWatch runs in DOS. Modification of the supplied + source code or creation of additional filters and + parsers requires Microsoft C 5.1 + + AVAILABILITY + LANWatch is commercially available from FTP Software, + Incorporated, 26 Princess Street, Wakefield, MA, 01880 + (617 246-0900). + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 72] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog LLL_ENTM + + NAME + ENTM -- Ethernet Traffic Monitor + + KEYWORDS + traffic; ethernet, IP; eavesdrop; VMS; free. + + ABSTRACT + ENTM is a screen-oriented utility that runs under + VAX/VMS. It monitors local ethernet traffic and + displays either a real time or cumulative, histogram + showing a percent breakdown of traffic by ethernet pro- + tocol type. The information in the display can be + reported based on packet count or byte count. The per- + cent of broadcast, multicast and approximate lost pack- + ets is reported as well. The screen display is updated + every three seconds. Additionally, a real time, slid- + ing history window may be displayed showing ethernet + traffic patterns for the last five minutes. + + ENTM can also report IP traffic statistics by packet + count or byte count. The IP histograms reflect infor- + mation collected at the TCP and UDP port level, includ- + ing ICMP type/code combinations. 
Both the ethernet and + IP histograms may be sorted by ASCII protocol/port name + or by percent-value. All screen displays can be saved + in a file for printing later. + + MECHANISM + This utility simply places the ethernet controller in + promiscuous mode and monitors the local area network + traffic. It preallocates 10 receive buffers and + attempts to keep 22 reads pending on the ethernet dev- + ice. + + CAVEATS + Placing the ethernet controller in promiscuous mode may + severely slow down a VAX system. Depending on the speed + of the VAX system and the amount of traffic on the lo- + cal ethernet, a large amount of CPU time may be spent + on the Interrupt Stack. Running this code on any pro- + duction system during operational hours is discouraged. + + BUGS + Due to a bug in the VAX/VMS ethernet/802 device driver, + IEEE 802 format packets may not always be detected. A + simple test is performed to "guess" which packets are + + + +NOCTools2 Working Group [Page 73] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + in IEEE 802 format (DSAP equal to SSAP). Thus, some + DSAP/SSAP pairs may be reported as an ethernet type, + while valid ethernet types may be reported as IEEE 802 + packets. + + In some hardware configurations, placing an ethernet + controller in promiscuous mode with automatic-restart + enabled will hang the controller. Our VAX 8650 hangs + running this code, while our uVAX IIs and uVAX IIIs do + not. + + Please report any additional bugs to the author at: + Allen Sturtevant + National Magnetic Fusion Energy Computer Center + Lawrence Livermore National Laboratory + P.O. Box 808; L-561 + Livermore, CA 94550 + Phone : (415) 422-8266 + E-Mail: sturtevant@ccc.nmfecc.gov + + LIMITATIONS + The user is required to have PHY_IO, TMPMBX and NETMBX + privileges.
When activated, the program first checks + that the user process has enough quotas remaining + (BYTLM, BIOLM, ASTLM and PAGFLQUO) to successfully run + the program without entering into an involuntary wait + state. Some quotas require a fairly generous setting. + + The contents of IEEE 802 packets are not examined. + Only the presence of IEEE 802 packets on the wire is + reported. + + The count of lost packets is approximated. If, after + each read completes on the ethernet device, the utility + detects that it has no reads pending on that device, + the lost packet counter is incremented by one. + + When the total number of bytes processed exceeds + 7fffffff hex, all counters are automatically reset to + zero. + + HARDWARE REQUIRED + A DEC ethernet controller. + + SOFTWARE REQUIRED + VAX/VMS version V5.1+. + + + + + +NOCTools2 Working Group [Page 74] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + AVAILABILITY + For executables only, FTP to the ANONYMOUS account + (password GUEST) on CCC.NMFECC.GOV and GET the follow- + ing files: + + [ANONYMOUS.PROGRAMS.ENTM]ENTM.DOC (ASCII text) + [ANONYMOUS.PROGRAMS.ENTM]ENTM.EXE (binary) + [ANONYMOUS.PROGRAMS.ENTM]EN_TYPES.DAT (ASCII text) + [ANONYMOUS.PROGRAMS.ENTM]IP_TYPES.DAT (ASCII text) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 75] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog Interactive Network Map + + NAME + map -- Interactive Network Map + + KEYWORDS + manager, map; CHAOS, ethernet, IP, ring, star; NMS, + ping, SNMP, X; UNIX; free, sourcelib. + + ABSTRACT + Map draws a map of network connectivity and allows + interactive examination of information about various + components including whether hosts can be reached over + the network.
+ + The program is supplied with complete source and is + written in a modular fashion to make addition of dif- + ferent protocols stacks, displays, or hardcopy devices + relatively easy. This is one of the reasons why the + initial version supports at least two of each. Contri- + butions of additional drivers in any of these areas + will be welcome as well as porting to additional plat- + forms. + + MECHANISM + Net components are pinged by use of ICMP echo and, + optionally, CHAOS status requests and SNMP "gets." The + program initializes itself from static data stored in + the file system and therefore does not need to access + the network in order to get running (unless the static + files are network mounted). + + CAVEATS + As of publication, the tool is in beta release. + + BUGS + Several minor nits, documented in distribution files. + Bug discoveries should be reported by email to Bug- + Map@LCS.MIT.Edu. + + LIMITATIONS + See distribution file for an indepth discussion of sys- + tem capabilities and potential. + + HARDWARE REQUIRED + An X display is needed for interactive display of the + map, non-graphical interaction is available in non- + display mode. For hardcopy output a PostScript or Tek- + + + +NOCTools2 Working Group [Page 76] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + tronix 4692 printer is required. + + SOFTWARE REQUIRED + BSD UNIX or related OS. IP/ICMP is required; + CHAOS/STATUS and SNMP can be used but are optional. + X-Windows is required for interactive display of the + map. + + AVAILABILITY + The program is Copyright MIT. It is available via + anonymous FTP with a license making it free to use and + distribute for non-commercial purposes. FTP to host + FTP.LCS.MIT.Edu, directory nets. The complete + distribution is in map.tar.Z and some short + documentation files are there (as well as in the + distribution). Of most interest are ReadMe and Intro. 
+ + To be added to the email forum that discusses the + software, or for other administrative details, send a + request to: MAP-Request@LCS.MIT.Edu + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 77] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog MCONNECT + + NAME + mconnect + + KEYWORDS + status; SMTP; spoof; UNIX. + + ABSTRACT + Mconnect allows an interactive session with a remote + mailer. Mail delivery problems can be diagnosed by + connecting to the remote mailer and issuing SMTP com- + mands directly. + + MECHANISM + Opens a TCP connection to remote SMTP on port 25. Pro- + vides local line buffering and editing, which is the + distinction between mconnect and a TELNET to port 25. + + CAVEATS + None. + + BUGS + None known. + + LIMITATIONS + Mconnect is not a large improvement over using a TELNET + connection to port 25. + + HARDWARE REQUIRED + No restrictions. + + SOFTWARE REQUIRED + BSD UNIX or related OS. + + AVAILABILITY + Available with 4.xBSD UNIX and related operating sys- + tems. + + + + + + + + + + + + + +NOCTools2 Working Group [Page 78] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog MIB-BROWSER + + NAME + MIB Browser + + KEYWORDS + manager; osi; cmis, x; unix; free, sourcelib. + + ABSTRACT + The MIB Browser is an X Windows HCI tool that allows + you to "browse" through the objects in a Management + Information Base (MIB). The browser is generic in that + it can connect to a CMIS agent without having any + prior knowledge of the structure of the MIB in the + agent. + + MECHANISM + CMIP is used to transfer the values of attributes + between the managed system and the browser. + + CAVEATS + None. + + BUGS + Unexpected termination of the agent can cause browser + to crash (ISODE bug!). + + HARDWARE REQUIRED + Unix workstation, has been tested on SUN 3 and SUN 4 + architectures. 
+ + SOFTWARE REQUIRED + The ISODE protocol suite, BSD UNIX, X Windows, GNU C++ + (g++), Interviews (2.6). + + AVAILABILITY + The CMIP library and related management tools built + upon it, known as OSIMIS (OSI Management Information + Service), are publicly available from University + College London, England via FTP and FTAM. To obtain + information regarding a copy send email to + osimis-request@cs.ucl.ac.uk or call +44 71 380 7366. + + + + + + + + + +NOCTools2 Working Group [Page 79] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog MONET + + NAME + MONET -- the Hughes LAN Systems SNMP Network Management + Center (formerly the Hughes LAN Systems 9100) software + product runs on a Sun SPARCStation hardware platform. + + KEYWORDS + control, graphics, network topology,manager, routing, + status, traffic; bridge, configuration, performance, + alarm management, relational database, mib parser for + RDBMS, intelligent hub management, DECnet, ethernet, + IP; NMS, SNMP; UNIX. + + ABSTRACT + Monet provides the capability to manage and control + SNMP-based networking products from any vendor including + those from Hughes LAN Systems. + + A comprehensive relational database manages the data and + ensures easy access and control of resources throughout + the network. + + Monet provides multivendor management through its + advanced Mib master MIB parser that allows the parsing + of enterprise MIBs (ASN.1 format per RFC1212) directly + into the RDBMS for use by Monet's applications. + + Major features include: + + Remote access with X: + Use of the X/Motif user-interface, enabling remote + access to the all applications. + + Database Management + Stores and retrieves the information required to + administer and configure the network. It can be + used to: + - Store and recall configuration data for all + devices. + - Provide availability history for devices. + - Assign new internet addresses. 
+ - Provide administrative information such as + physical location of devices, responsible + person, maintenance history, asset data, + hardware/software versions, etc. + - Full-function SQL interface. + - User-customizable RDBMS report generation. + + + +NOCTools2 Working Group [Page 80] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Graphics and Network Mapping + The Graphics module enables the user to view the + nodes in the network as "dynamic" icons in + hierarchical maps. The network is represented by + these hierarchical maps. Though there is a + library of device icons, cities and geographical + maps included, the user has access to a + graphics editor that allows customizing and the + creation of new icons and maps. + A Device's icon may be selected to: + - Register/deregister the device, + - Access the open alarms and acknowledge + faults for the selected device, + - Ping the device to determine accessibility, + - Draw graphs of any of the device's numeric + MIB objects, either the values as retrieved + in real-time or the history values + previously stored in the RDBMS by the + Performance Manager, + - Telnet to the device, + - Customize the graphical dynamics (color, + fill, rotation, etc.) of the device's icon + by associating them to the values of the + device's MIB objects. + + Configuration Management + - Retrieves configuration information from SNMP + devices. + - Stores device parameters in the RDBMS, with + common sets of parameters used for multiple + devices, or for multiple ports on a device, + stored only once in the RDBMS. + - Configures devices from the parameters stored in + the RDBMS, including those relating to TCP/IP, + DECnet and any other protocol/feature + configurable via SNMP. + - Polls devices to compare their current parameter + values with those in the database and produce + reports of the discrepancies. + - Collect data about the state of the network.
+ - Learn the parameters of the devices in the + network and populate the database. + + Performance Management + - Displays local network traffic graphically, by + packet size, protocol, network utilization, + sources and destinations of packets, etc. + - Provides for the scheduling of jobs to retrieve + + + +NOCTools2 Working Group [Page 81] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + MIB values of a device and store them in the RDBMS + for review or summary reporting at a later time. + - Allows high/low thresholds to be set on retrieved + values with alarms generated when thresholds are + exceeded. + + Fault Management + - Provides availability monitoring and indicates + potential problems. + - Creates alarms from received SNMP traps, and from + other internally-generated conditions, + - Records alarms in the alarm log in the RDBMS. + - Lists alarms for selected set of devices, + according to various filter conditions, + - Possible causes and suggested actions for the + alarms are listed. + - New alarms are indicated by a flashing icon and + optional audio alert. + - Visual indication of alarms bubbles up the network + map hierarchy. + - Cumulative reports can be produced. + + Utilities Function + - View and/or terminate current NMC processes, + - Access to database maintenance utilities. + + MECHANISM + SNMP. + + CAVEATS + None reported. + + BUGS + None known. + + LIMITATIONS + Maximum number of nodes that can be monitored is + 18,000. This can include Hosts, Terminal Servers, PCs, + Routers, and Bridges. + + HARDWARE REQUIRED + The host for the NMC software is a Sun 4 desktop works- + tation. Recommended minimum hardware is the Sun IPX + Color workstation, with a 1/4" SCSI tape drive. + + SOFTWARE REQUIRED + MONET V5.0, which is provided on 1/4" tape format, runs on + the Sun 4.1.1 Operating System.
+ + + +NOCTools2 Working Group [Page 82] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + AVAILABILITY AND CONTACT POINT FOR INFORMATION ABOUT THIS TOOL + A commercial product of: + Hughes LAN Systems Inc. + 1225 Charleston Road + Mountain View, CA 94043 + Phone: (415) 966-7300 + Fax: (415) 960-3738 + RCA Telex: 276572 + + CONTACT POINT FOR CHANGES TO THIS CATALOG ENTRY + kishoret@msgate.hls.com + kzm@hls.com + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 83] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog NET_MONITOR + + NAME + net_monitor + + KEYWORDS + routing, status; DECnet, IP; curses, ping; UNIX, VMS; + free, sourcelib. + + ABSTRACT + Net_monitor uses ICMP echo (and DECnet reachability + information on VAX/VMS) to monitor a network. The mon- + itoring is very simplistic, but has proved useful. It + periodically tests whether hosts are reachable and + reports the results in a full-screen display. It + groups hosts together in common sets. If all hosts in + a set become unreachable, it makes a lot of racket with + bells, since it assumes that this means that some com- + mon piece of hardware that supports that set has + failed. The periodicity of the tests, hosts to test, + and groupings of hosts are controlled with a single + configuration file. + + The idea for this program came from the PC/IP monitor + facility, but is an entirely different program with + different functionality. + + MECHANISM + Reachability is tested using ICMP echo facilities for + TCP/IP hosts (and DECnet reachability information on + VAX/VMS). A DECnet node is considered reachable if it + appears in the list of hosts in a "show network" com- + mand issued on a routing node. + + CAVEATS + This facility has been found to be most useful when run + in a window on a workstation rather than on a terminal + connected to a host. 
It could be useful if ported to a + PC (looks easy using FTP Software's programming + libraries), but this has not been done. Curses is very + slow and cpu intensive on VMS, but the tool has been + run in a window on a VAXstation 2000. Just don't try + to run it on a terminal connected to a 11/750. + + BUGS + None known. + + + + + +NOCTools2 Working Group [Page 84] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + LIMITATIONS + This tool is not meant to be a replacement for a more + comprehensive network management facility such as is + provided with SNMP. + + HARDWARE REQUIRED + A host with a network connection. + + SOFTWARE REQUIRED + Curses, 4.xBSD UNIX socket programming libraries (lim- + ited set) and some flavor of TCP/IP that supports ICMP + echo request (ping). It has been run on VAX/VMS run- + ning WIN/TCP and several flavors of 4BSD UNIX (includ- + ing SunOS 3.2, 4.0, and 4.3BSD). It could be ported to + any platform that provides a BSD-style programming li- + brary with an ICMP echo request facility and curses. + + AVAILABILITY + Requests should be sent to the author: + + Dale Smith + Asst Dir of Network Services + University of Oregon + Computing Center + Eugene, OR 97403-1211 + + Internet: dsmith@oregon.uoregon.edu. + BITNET: dsmith@oregon.bitnet + UUCP: ...hp-pcd!uoregon!dsmith + Voice: (503)686-4394 + + With the source code, a makefile is provided for most + any UNIX box and a VMS makefile compatible with the + make distributed with PMDF. A VMS DCL command file is + also provided, for use by those VMS sites without + "make." + + The author will attempt to fix bugs, but no support is + promised. The tool is copyrighted, but free (for now). + + + + + + + + + + + + +NOCTools2 Working Group [Page 85] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog NETLABS_CMOT_AGENT + + NAME + Netlabs CMOT Agent + + KEYWORDS + manager, status; IP, OSI; NMS. + + ABSTRACT + Netlabs' CMOT code debuted in Interop 89. 
The CMOT + code comes with an Extensible MIB, which allows users + to add new MIB variables. The code currently supports + all the MIB variables in RFC 1095 via the data types in + RFC 1065, as well as the emerging MIB-II, which is + currently in experimental stage. The CMOT has been + benchmarked at 100 Management Operations per Second + (MOPS) for a 1-MIPS machine. + + MECHANISM + The Netlabs CMOT agent supports the control and moni- + toring of network resources by use of CMOT message + exchanges. + + CAVEATS + None. + + BUGS + None known. + + LIMITATIONS + None reported. + + HARDWARE REQUIRED + Portable to most hardware. + + SOFTWARE REQUIRED + Portable to most operating systems. + + AVAILABILITY + Commercially available from: + Netlabs Inc + 11693 Chenault Street Ste 348 + Los Angeles CA 90049 + (213) 476-4070 + lam@netlabs.com (Anne Lam) + + + + + + +NOCTools2 Working Group [Page 86] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog NETLABS_DUAL_MANAGER + + NAME + Dual Manager + + KEYWORDS + alarm, control, manager, map, security, status; IP, + OSI; NMS, SNMP, X; UNIX; library. + + ABSTRACT + Netlabs' Dual Manager provides management of TCP/IP + networks using both SNMP and CMOT protocols. Such + management can be initiated either through the X- + Windows user interface (both Motif and Openlook), or + through OSI Network Management (CMIP) commands. The + Dual Manager provides for configuration, fault, secu- + rity and performance management. It provides extensive + map management features, including scanned maps in the + background. It provides simple mechanisms to extend + the MIB and assign specific lists of objects to + specific network elements, thereby providing for the + management of all vendors' specific MIB extensions. It + provides an optional relational DBMS for storing and + retrieving MIB and alarm information.
Finally, the + Dual Manager is an open platform, in that it provides + several Application Programming Interfaces (APIs) for + users to extend the functionality of the Dual Manager. + + The Dual Manager is expected to work as a TCP/IP + "branch manager" under DEC's EMA, AT&T's UNMA and other + OSI-conformant enterprise management architectures. + + MECHANISM + The Netlabs Dual Manager supports the control and moni- + toring of network resources by use of both CMOT and + SNMP message exchanges. + + CAVEATS + None. + + BUGS + None known. + + LIMITATIONS + None reported. + + HARDWARE REQUIRED + Runs on Sun/3 and Sun/4s. + + + +NOCTools2 Working Group [Page 87] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + SOFTWARE REQUIRED + Available on System V or SCO Open Desktop environments. + Uses X-Windows for the user interface. + + AVAILABILITY + Commercially available from: + Netlabs Inc + 11693 Chenault Street Ste 348 + Los Angeles CA 90049 + (213) 476-4070 + lam@netlabs.com (Anne Lam) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 88] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog NETLABS_SNMP_AGENT + + NAME + Netlabs SNMP Agent. + + KEYWORDS + manager, status; IP; NMS, SNMP. + + ABSTRACT + Netlabs' SNMP code debuted in Interop 89, where it + showed interoperation of the code with several imple- + mentations on the show floor. The SNMP code comes with + an Extensible MIB, which allows users to add new MIB + variables. The code currently supports all the MIB + variables in RFC 1066 via the data types in RFC 1065, + as well as the emerging MIB-II, which is currently in + experimental stage. The SNMP has been benchmarked at + 200 Management Operations per Second (MOPS) for a 1- + MIPS machine. + + MECHANISM + The Netlabs SNMP agent supports the control and moni- + toring of network resources by use of SNMP message + exchanges. + + CAVEATS + None. 
+ + BUGS + None known. + + LIMITATIONS + None reported. + + HARDWARE REQUIRED + Portable to most hardware. + + SOFTWARE REQUIRED + Portable to most operating systems. + + AVAILABILITY + Commercially available from: + Netlabs Inc + 11693 Chenault Street Ste 348 + Los Angeles CA 90049 + (213) 476-4070 + lam@netlabs.com (Anne Lam) + + + + +NOCTools2 Working Group [Page 89] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog NetMetrix-Load-Monitor + + NAME + NetMetrix Load Monitor + + KEYWORDS + alarm,traffic; Ethernet, FDDI, IP, Ring; Eavesdrop, + SNMP, X; UNIX; + + ABSTRACT + The NetMetrix Load Monitor is a distributed + client-server monitoring tool for ethernet, token + ring, and FDDI networks. A unique "dual" architecture + provides compatibility with both RMON and X windows. + RMON allows interoperability and an enterprise-wide + view, while X windows enables much more powerful, + intelligent applications at remote segments and saves + network bandwidth. + + The Load Monitor provides extensive traffic + statistics. It looks at load by time interval, source + node, destination node, application, protocol or + packet size. A powerful ZOOM feature allows extensive + correlational analysis which is displayed in a wide + variety of graphs and tables. + + You can answer questions such as: Which sources are + generating most of the load on the network when it is + most heavily loaded and where is this load going? + Which source/destination pairs generate the most + traffic over the day? Where should bridges and + routers be located to optimally partition the network? + How much load do applications, like the X Windows + protocol, put on the network and who is generating that + load when it is the greatest. + + A floating license allows easy access to the software + tool anywhere you need it. + + MECHANISM + NetMetrix turns the network interface into promiscuous + mode to capture packets. + + CAVEATS + none. + + BUGS + none known. 
+ + + +NOCTools2 Working Group [Page 90] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + LIMITATIONS + none. + + HARDWARE REQUIRED + SPARC system + + SOFTWARE REQUIRED + SunOS 4.0 or higher + + AVAILABILITY AND CONTACT POINT FOR INFORMATION ABOUT THIS TOOL + NetMetrix is available from: + Sales Department + Metrix Network Systems, Inc. + One Tara Boulevard + Nashua, New Hampshire 03062 + telephone: 603-888-7000 + fax: 603-891-2796 + email: info@metrix.com + + Government agencies please note that NetMetrix is on the GSA + schedule. + + CONTACT POINT FOR CHANGES TO THIS CATALOG ENTRY + Norma Shepperd + Marketing Administrator + 603-888-7000 + norma@metrix.com + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 91] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog NetMetrix-NFS-Monitor + + NAME + NetMetrix NFS Monitor + + KEYWORDS + traffic; Ethernet, FDDI, NFS, Ring; Eavesdrop, SNMP, X; + UNIX + + ABSTRACT + The NetMetrix NFS Monitor is a distributed network + monitoring tool which monitors and graphs NFS load, + response time, retransmits, rejects and errors by + server, client, NFS procedure, or time + interval. Breakdown server activity by file system + and client activity by user. + + A powerful ZOOM feature lets you correlate monitoring + variables. You can see client/server relationships, + compare server performance, evaluate NFS performance + enhancement strategies. + + A floating license and the X Window protocol allows + monitoring of remote ethernet, token ring and FDDI + segments from a central enterprise-wide display. + + MECHANISM + NetMetrix turns the network interface into promiscuous + mode to capture packets. + + CAVEATS + none. + + BUGS + none known. + + LIMITATIONS + none. 
+ + HARDWARE REQUIRED + SPARC system + + SOFTWARE REQUIRED + SunOS 4.0 or higher + + + + + + + +NOCTools2 Working Group [Page 92] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + AVAILABILITY AND CONTACT POINT FOR INFORMATION ABOUT THIS TOOL + NetMetrix is available from: + Sales Department + Metrix Network Systems, Inc. + One Tara Boulevard + Nashua, New Hampshire 03062 + telephone: 603-888-7000 + fax: 603-891-2796 + email: info@metrix.com + + Government agencies please note that NetMetrix is on + the GSA schedule. + + CONTACT POINT FOR CHANGES TO THIS CATALOG ENTRY + Norma Shepperd + Marketing Administrator + 603-888-7000 + norma@metrix.com + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 93] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog NetMetrix-Protocol-Analyzer + + NAME + NetMetrix Protocol Analyzer + + KEYWORDS + alarm, analyzer, traffic; DECnet, DNS, Ethernet, FDDI, + IP, OSI, NFS, Ring, SMTP; Eavesdrop, SNMP, X; UNIX; + Library + + ABSTRACT + The NetMetrix Protocol Analyzer is a distributed + client-server monitoring tool for ethernet, token + ring, and FDDI networks. A unique "dual" architecture + provides compatibility with both RMON and + X windows. RMON allows interoperability, while X + windows enables much more powerful, intelligent + applications at remote segments and saves network + bandwidth. + + With the Protocol Analyzer, you can decode and display + packets as they are being captured. Extensive filters + let you sift through packets either before or after + trace capture. The capture filter may be specified by + source, destination between hosts, protocol, packet + size, pattern match, or by a complete expression using + an extensive filter expression language. + + Full 7-layer packet decodes are available for all + major protocols including DECnet, Appletalk, Novell, + XNS, SNA, BANYAN, OSI and TCP/IP. 
The decodes for the + TCP/IP stack have all major protocols including NFS, + YP, DNS, SNMP, OSPF, etc. + + Request and reply packets are matched. Packets can be + displayed in summary, detail or hex, with multiple + views to see packet dialogues side by side. + + A complete developers' kit is available for custom + decodes. + + A floating license allows easy access to the software + tool anywhere you need it. + + MECHANISM + NetMetrix turns the network interface into promiscuous + mode to capture packets. + + + + +NOCTools2 Working Group [Page 94] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + CAVEATS + none. + + BUGS + none known. + + LIMITATIONS + none. + + HARDWARE REQUIRED + SPARC system + + SOFTWARE REQUIRED + SunOS 4.0 or higher + + AVAILABILITY AND CONTACT POINT FOR INFORMATION ABOUT THIS TOOL + NetMetrix is available from: + Sales Department + Metrix Network Systems, Inc. + One Tara Boulevard + Nashua, New Hampshire 03062 + telephone: 603-888-7000 + fax: 603-891-2796 + email: info@metrix.com + + Government agencies please note that NetMetrix is on the + GSA schedule. + + CONTACT POINT FOR CHANGES TO THIS CATALOG ENTRY + Norma Shepperd + Marketing Administrator + 603-888-7000 + norma@metrix.com + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 95] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog NetMetrix-Traffic-Generator + + NAME + NetMetrix Traffic Generator + + KEYWORDS + Debugger, Generator, Traffic; Ethernet, FDDI, IP, + Ring; Eavesdrop, SNMP, X; UNIX; Library + + ABSTRACT + The NetMetrix Traffic Generator is a distributed + software tool which allows you to simulate network + load or test packet dialogues between nodes on your + ethernet, token ring, or FDDI segments. The Traffic + Generator can also be used to test and validate + management station alarms, routers, bridges, hubs, etc.
+ + An easy-to-use programming interface provides complete + flexibility over variables such as bandwidth, packet + sequence, and conditional responses. + + A floating license and the X Window System protocol + allows testing of remote ethernet, token ring and FDDI + segments from a central console. + + MECHANISM + NetMetrix turns the network interface into promiscuous + mode to capture packets. + + CAVEATS + none. + + BUGS + none known. + + LIMITATIONS + none. + + HARDWARE REQUIRED + SPARC system + + SOFTWARE REQUIRED + SunOS 4.0 or higher + + + + + + + + +NOCTools2 Working Group [Page 96] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + AVAILABILITY AND CONTACT POINT FOR INFORMATION ABOUT THIS TOOL + NetMetrix is available from: + Sales Department + Metrix Network Systems, Inc. + One Tara Boulevard + Nashua, New Hampshire 03062 + telephone: 603-888-7000 + fax: 603-891-2796 + email: info@metrix.com + + Government agencies please note that NetMetrix is on + the GSA schedule. + + CONTACT POINT FOR CHANGES TO THIS CATALOG ENTRY + Norma Shepperd + Marketing Administrator + 603-888-7000 + norma@metrix.com + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 97] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog NETMON_MITRE + + NAME + NETMON and iptrace + + KEYWORDS + traffic; IP; eavesdrop; UNIX; free. + + ABSTRACT + NETMON is a facility to enable communication of net- + working events from the BSD UNIX operating system to a + user-level network monitoring or management program. + Iptrace is a program interfacing to NETMON which logs + TCP-IP traffic for performance measurement and gateway + monitoring. It is easy to build other NETMON-based + tools using iptrace as a model. + + NETMON resides in the 4.3BSD UNIX kernel. It is + independent of hardware-specific code in UNIX. 
It is + transparent to protocol and network type, having no + internal assumptions about the network protocols being + recorded. It is installed in BSD-like kernels by + adding a standard function call (probe) to a few points + in the input and output routines of the protocols to be + logged. + + NETMON is analogous to Sun Microsystems' NIT, but the + interface tap function is extended by recording more + context information. Aside from the timestamp, the + choice of information recorded is up to the installer + of the probes. The NETMON probes added to the BSD IP + code supplied with the distribution include as context: + input and output queue lengths, identification of the + network interface, and event codes labeling packet dis- + cards. (The NETMON distribution is geared towards + measuring the performance of BSD networking protocols + in an IP gateway). + + NETMON is designed so that it can reside within the + monitored system with minimal interference to the net- + work processing. The estimated and measured overhead + is around five percent of packet processing. + + The user-level tool "iptrace" is provided with NETMON. + This program logs IP traffic, either at IP-level only, + or as it passes through the network interface drivers + as well. As a separate function, iptrace produces a + host traffic matrix output. Its third type of output + + + +NOCTools2 Working Group [Page 98] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + is abbreviated sampling, in which only a pre-set number + of packets from each new host pair is logged. The + three output types are configured dynamically, in any + combination. + + OSITRACE, another logging tool with a NETMON interface, + is available separately (and documented in a separate + entry in this catalog). + + MECHANISM + Access to the information logged by NETMON is through a + UNIX special file, /dev/netmon. User reads are blocked + until the buffer reaches a configurable level of full- + ness. 
+ + Several other parameters of NETMON can be tuned at com- + pile time. A diagnostic program, netmonstat, is + included in the distribution. + + CAVEATS + None. + + BUGS + Bug reports and questions should be addressed to: + ie-tools@gateway.mitre.org + Requests to join this mailing list: + ie-tools-request@gateway.mitre.org + Questions and suggestions can also be directed to: + Allison Mankin (703)883-7907 + mankin@gateway.mitre.org + + LIMITATIONS + A NETMON interface for tcpdump and other UNIX protocol + analyzers is not included, but it is simple to write. + NETMON probes for a promiscuous ethernet interface are + similarly not included. + + HARDWARE REQUIRED + No restrictions. + + SOFTWARE REQUIRED + BSD UNIX-like network protocols or the ability to + install the BSD publicly available network protocols in + the system to be monitored. + + + + + + + +NOCTools2 Working Group [Page 99] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + AVAILABILITY + The NETMON distribution is available by anonymous FTP + in pub/netmon.tar or pub/netmon.tar.Z from aelred- + 3.ie.org. A short user's and installation guide, + NETMON.doc, is available in the same location. The + NETMON distribution is provided "as is" and requires + retention of a copyright text in code derived from it. + It is copyrighted by the MITRE-Washington Networking + Center. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 100] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog NETMON_WINDOWS_SNMP_RESEARCH + + NAME + NETMON for Windows -- an SNMP-based network management + tool that runs under Microsoft Windows 3.0 from SNMP + Research. + + KEYWORDS + alarm, control, manager, map, routing; + DECnet, Ethernet, IP, OSI, ring, star; + NMS, SNMP; + DOS; + sourcelib. + + ABSTRACT + The NETMON application implements a powerful network + management station based on a low-cost DOS platform. 
+ NETMON's network management tools for configuration, + performance, security, and fault management have been + used successfully with a wide assortment of wide- and + local-area-network topologies and media. Multiprotocol + devices are supported including those using TCP/IP, + DECnet, and OSI protocols. + + Some features of NETMON's network management tools include: + + o Fault management tool displays a map of the network + configuration with node and link state indicated + in one of several colors to indicate current status; + o Configuration management tool may be used to edit the + network management information base stored in the + NMS to reflect changes occurring in the network; + o Graphs and tabular tools for use in fault and performance + management; + o Mechanisms by which additional variables, such as vendor- + specific variables, may be added; + o Alarms may be enabled to alert the operator of events + occurring in the network; + o Events are logged to disk; + o Output data may be transferred via flat files for + additional report generation by a variety of + statistical packages. + + The NETMON application comes complete with source code + including a powerful set of portable libraries for generating + and parsing SNMP messages. + + + + + +NOCTools2 Working Group [Page 101] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + MECHANISM + The NETMON for Windows application is based on the + Simple Network Management Protocol (SNMP). Polling is + performed via the powerful SNMP get-next operator and + the SNMP get operator. Trap directed polling is used + to regulate the focus and intensity of the polling. + + CAVEATS + None. + + BUGS + None known. + + LIMITATIONS + None reported. + + HARDWARE REQUIRED + The minimum system is an IBM 386 computer, or + compatible, with hard disk drive. + + SOFTWARE REQUIRED + DOS 5.0 or later, Windows 3.0 in 386 mode, and TCP/IP + kernel software from FTP Software.
+ + AVAILABILITY AND CONTACT POINT FOR INFORMATION ABOUT THIS TOOL + This is a commercial product available under license + from: + SNMP Research + 3001 Kimberlin Heights Road + Knoxville, TN 37920-9716 + Attn: John Southwood, Sales and Marketing + (615) 573-1434 (Voice) (615) 573-9197 (FAX) + + CONTACT POINT FOR CHANGES TO THIS CATALOG ENTRY + users@seymour1.cs.utk.edu + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 102] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog NETscout + + NAME + NETscout(tm) + + KEYWORDS + Alarm, Analyzer, Manager, Status, Traffic; + DECnet, Ethernet, IP, OSI, NFS, Ring, Star, Eavesdrop; + NMS, SNMP; + UNIX; + + ABSTRACT + The NETscout family of distributed LAN Analyzer + devices are intended to provide network users with a + comprehensive capability to identify and isolate fault + conditions in data communications networks. + NETscout has the capability to collect wide ranging + statistical data, to display selectively captured and + fully decoded network traffic, to set user-defined + alarm conditions, and to obtain real-time updates + from all segments of a widely dispersed internetwork + from a centralized SNMP-compatible network management + console. + + The NETscout family is based on standards so that + operation may be realized in heterogeneous networks + which constitute a multi-protocol, multi-topology, + multi-vendor environment. The fundamental standards + upon which NETscout is based are the Simple Network + Management Protocol (SNMP), which defines the protocol + for all inter-communications between NETscout devices, + and the Remote Monitoring Management Information Base + (RMON-MIB), which defines the type of information + which is to be gathered and made available to the + user for each network segment. 
+ + NETscout clients provide a full array of monitoring + and analysis features including intelligent seven + level decoding of all major protocol stacks: + + DOD including TCP/IP XNS Novell + DECNET including LAT ISO APPLETALK + IBM Token Ring Vines NETBIOS/SMB + SNMP including RMON-MIB SUN-NFS SMT + + NETscout agents support all nine groups of the + RMON-MIB standard. NETscout agents can work with any + SNMP-based network management system and currently + + + +NOCTools2 Working Group [Page 103] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + support Ethernet and Token Ring. + + MECHANISM + The operation of the NETscout family is divided into + two distinct subcategories. The first is the "Client" + which is the user console from which operational + commands are issued and where all results and + diagnostic information are displayed. In a NETscout + topology it is feasible to have multiple clients + active simultaneously within a single network. The + second category is the "Agent", a hardware/software + device which is attached to a specific network + segment and which gathers statistical information for + that segment as well as providing a window into that + segment where network traffic may be observed and + gathered for more detailed user analysis. A + typical network will have multiple segments and + multiple agents up to the point of having one agent + for each logical network segment. + + NETscout Model 9210 is a software package which, when + combined in a Sun SPARCstation in conjunction with + SunNet Manager running under Open Windows, implements + the NETscout client function. SunNet Manager provides + the background operational tools for client operation + while the NETscout software provides + application-specific functions related to RMON-MIB + support as well as all software necessary to + perform the protocol decode function.
+ SunNet Manager also implements a network map file + which includes a topographical display of the entire + network and is the mechanism for selecting + network elements to perform operations. + + NETscout Model 9215 is a software package that + operates in conjunction with SunNet Manager and + implements the statistics monitoring function only. + That is, it does not include the protocol + decode function or the mechanism to retrieve actual + data from a remote agent. It does, however, include + complete statistics gathering and event and alarm + generation. + + Frontier NETscout Models 9510 and 9515, and Model 9610 + and 9615 are agent software packages that implement + selected network diagnostic functions when loaded into + a Sun SPARCstation (9510, 9515) or a SynOptics + LattisNet Hub (9610, 9615) respectively which is + + + +NOCTools2 Working Group [Page 104] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + connected to an Ethernet network segment + using conventional network interface hardware. Models + 9510 and 9610 support all nine RMON-MIB groups + including "filters" and "packet capture" and thus + provide for complete protocol monitoring and decode + when used with a client + equipped with protocol decode software. Models 9515 + and 9615 include support for seven RMON-MIB groups + which excludes "filters" and "data capture" and + therefore perform network monitoring only through + collection and presentation of network statistics, + events, and alarms. All models also support the MIB2 + system and interface groups. + + Frontier NETscout Models 9520 and 9525, and Model 9620 + and 9625 are agent software packages that are + identical in function to their respective models + described above except that they are for use on + Token Ring segments. + + CAVEATS + The RMON-MIB standard for Token Ring applications has + not yet been formally released and is not approved.
+ NETscout products correspond to the latest draft for + Token Ring functions and will be updated as + required to conform to the standard as it is approved. + + BUGS + None known. + + LIMITATIONS + None reported. + + HARDWARE REQUIRED + Sun SPARCstation or LattisNet Hub depending upon Model + number. + + SOFTWARE REQUIRED + Sun OS 4.1.1 for client and agent, SunNet Manager for + client. + + + + + + + + + + + +NOCTools2 Working Group [Page 105] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + AVAILABILITY AND CONTACT POINT FOR INFORMATION ABOUT THIS TOOL + NETscout products are available commercially. For + information regarding your local representative, contact: + Frontier Software Development, Inc. + 1501 Main Street + Tewksbury, MA 01876 + Phone: 508-851-8872 + Fax: 508-851-6956 + + CONTACT POINT FOR CHANGES TO THIS CATALOG ENTRY + Marketing + Frontier Software + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 106] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog NETSTAT + + NAME + netstat + + KEYWORDS + routing; IP; UNIX, VMS; free. + + ABSTRACT + Netstat is a program that accesses network related data + structures within the kernel, then provides an ASCII + format at the terminal. Netstat can provide reports on + the routing table, TCP connections, TCP and UDP + "listens", and protocol memory management. + + MECHANISM + Netstat accesses operating system memory to read the + kernel routing tables. + + CAVEATS + Kernel data structures can change while netstat is run- + ning. + + BUGS + None known. + + LIMITATIONS + None reported. + + HARDWARE REQUIRED + No restrictions. + + SOFTWARE REQUIRED + BSD UNIX or related OS, or VMS. + + AVAILABILITY + Available via anonymous FTP from uunet.uu.net, in + directory bsd-sources/src/ucb. Available with 4.xBSD + UNIX and related operating systems. 
For VMS, available + as part of TGV MultiNet IP software package, as well as + Wollongong's WIN/TCP. + + + + + + + + + + +NOCTools2 Working Group [Page 107] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog NETWORK_INTEGRATOR + + NAME + Network Integrator I + + KEYWORDS + map, traffic; ethernet; UNIX. + + ABSTRACT + This tool monitors traffic on network segments. All + information is dumped to either a log file or, for + real-time viewing, to a command tool window. Data is + time-stamped according to date and time. Logging can + continue for up to 24 hours. + + The tool is flexible in data collection and presenta- + tion. Traffic filters can be specified according to + header values of numerous protocols, including those + used by Apple, DEC, Sun, HP, and Apollo. Bandwidth + utilization can be monitored, as well as actual load + and peak throughput. Additionally, the Network + Integrator can analyze a network's topology, and record + the location of all operational nodes on a network. + + Data can be displayed in six separate formats of bar + graphs. In addition, there are several routines for + producing statistical summaries of the data collected. + + MECHANISM + The tools work through RPC and XDR calls. + + CAVEATS + Although the tool adds only little traffic to a net- + work, generation of statistics from captured files + requires a significant portion of a workstation's CPU. + + BUGS + None known. + + LIMITATIONS + Must be root to run monitor. There does not seem to be + a limit to the number of nodes, since it monitors by + segments. The only major limitation is the amount of + disk space that a user can commit to the log files. + The size of the log files, however, can be controlled + through the tool's parameters. + + + + + +NOCTools2 Working Group [Page 108] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + HARDWARE REQUIRED + Sun3 or Sun4. + + SOFTWARE REQUIRED + 4.0BSD UNIX or greater, or related OS. 
+ + AVAILABILITY + Copyrighted, commercially available from + Network Integrators, + (408) 927-0412. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 109] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog NFSwatch + + NAME + nfswatch + + KEYWORDS + Traffic; Ethernet, IP, NFS; Curses, Eavesdrop; UNIX; + Free + + ABSTRACT + Nfswatch monitors all incoming ethernet traffic to an + NFS file server and divides it into several + categories. The number and percentage of packets + received in each category is displayed on + the screen in a continuously updated display. + + By default, nfswatch monitors all packets destined for + the local host over a single network interface. + Options are provided to specify the specific interface + to be monitored, or all interfaces at once. NFS + traffic to the local host, to a remote host, from a + specific host, between two hosts, or all NFS traffic + on the network may be monitored. + + Categories of packets monitored and counted include: + ND Read, ND Write, NFS Read, NFS Write, NFS Mount, + Yellow Pages (NIS), RPC Authorization, Other RPC, TCP, + UDP, ICMP, RIP, ARP, RARP, Ethernet Broadcast, and + Other. + + Packets are also tallied either by file system or file + (specific files may be watched as an option), NFS + procedure name (RPC call), or NFS client hostname. + + Facilities for taking "snapshots" of the screen, as + well as saving data to a log file for later analysis + (the analysis tool is included) are also available. + + MECHANISM + Nfswatch uses the Network Interface Tap, nit(4) under + SunOS 4.x, and the Packet Filter, packetfilter(4), + under Ultrix 4.x, to place the ethernet interface into + promiscuous mode. It filters out NFS packets, and + decodes the file handles in order to determine how to + count the packet. 
+ + + + + + +NOCTools2 Working Group [Page 110] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + CAVEATS + Because the NFS file handle is a non-standard (server + private) piece of data, nfswatch must be modified to + understand file handles used by various + implementations. It currently knows + about the SunOS 4.x and Ultrix file handle formats. + + BUGS + Does not monitor FDDI interfaces. (It should be a + simple change, but neither author has access to a + system with FDDI interfaces for testing.) + + LIMITATIONS + Up to 256 exported file systems and 256 individual + files can be monitored at any time. + + Only NFS requests are counted; the NFS traffic + generated by a server in response to those packets + is not counted. + + HARDWARE REQUIRED + Any Ultrix system (VAX or DEC RISC hardware) + + SOFTWARE REQUIRED + Ultrix release 4.0 or later. For Ultrix 4.1, may + require the patched "if_ln.o" kernel module, available + from Digital's Customer Support Center. + + AVAILABILITY AND CONTACT POINT FOR INFORMATION ABOUT THIS TOOL + Copyrighted, but freely distributable. Available via + anonymous FTP from harbor.ecn.purdue.edu, + ftp.erg.sri.com, and gatekeeper.dec.com, as well as + numerous other sites around the Internet. The current + version is Version 3.0 from January 1991. + + Contact points: + + Dave Curry Jeff Mogul + Purdue University Digital Equipment Corp. + Engineering Computer Network Western Research Laboratory + 1285 Electrical Engineering Bldg. 100 Hamilton Avenue + West Lafayette, IN 47907-1285 Palo Alto, CA 94301 + davy@ecn.purdue.edu mogul@decwrl.dec.com + + + CONTACT POINT FOR CHANGES TO THIS CATALOG ENTRY + Dave Curry (see address above). + + + + +NOCTools2 Working Group [Page 111] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog NHFSSTONE + + NAME + nhfsstone + + KEYWORDS + benchmark, generator; NFS; spoof; UNIX; free. 
+ + ABSTRACT + Nhfsstone (pronounced n-f-s-stone, the "h" is silent) + is an NFS benchmarking program. It is used on an NFS + client to generate an artificial load with a particular + mix of NFS operations. It reports the average response + time of the server in milliseconds per call and the + load in calls per second. The nhfsstone distribution + includes a script, "nhfsnums" that converts test + results into plot(5) format so that they can be graphed + using graph(1) and other tools. + + MECHANISM + Nhfsstone is an NFS traffic generator. It adjusts its + calling patterns based on the client's kernel NFS + statistics and the elapsed time. Load can be generated + over a given time or number of NFS calls. + + CAVEATS + Nhfsstone will compete for system resources with other + applications. + + BUGS + None known. + + LIMITATIONS + None reported. + + HARDWARE REQUIRED + No restrictions. + + SOFTWARE REQUIRED + 4.xBSD-based UNIX + + + + + + + + + + + +NOCTools2 Working Group [Page 112] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + AVAILABILITY + Available via anonymous FTP from bugs.cs.wisc.edu. + Alternatively, Legato Systems will provide the program + free of charge, if certain conditions are met. Send + name and both email and U.S. mail addresses to: + Legato Systems, Inc. + Nhfsstone + 260 Sheridan Avenue + Palo Alto, California 94306 + + A mailing list is maintained for regular information + and bug fixes: nhfsstone@legato.com or + uunet!legato.com!nhfsstone. To join the list: + nhfsstone-request@legato.com or + uunet!legato.com!nhfsstone-request. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 113] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog NNSTAT + + NAME + NNStat + + KEYWORDS + manager, status, traffic; ethernet, IP; eavesdrop, NMS; + UNIX; free. 
+ + ABSTRACT + NNStat is a collection of programs that provides an + internet statistic collecting capability. The NNStat + strategy for statistic collection is to collect traffic + statistics via a promiscuous ethernet tap on the local + networks, versus instrumenting the gateways. If all + traffic entering or leaving a network or set of net- + works traverses a local ethernet, then by stationing a + statistic gathering agent on each local network a pro- + file of network traffic can be gathered. Statistical + data is retrieved from the local agents by a global + manager. + + A program called "statspy" performs the data gathering + function. Essentially, statspy reads all packets on an + ethernet interface and records all information of + interest. Information of interest is gathered by exa- + mining each packet and determining if the source or + destination IP address is one that is being monitored, + typically a gateway address. If so then the contents + of the packet are examined to see if they match further + criteria. + + A program called "collect" performs global data collec- + tion. It periodically polls various statspy processes + in the domain of interest to retrieve locally logged + statistical data. + + The NNSTAT distribution comes with several sample awk + programs which process the logged output of the collect + program. + + MECHANISM + Local agents (statspy processes) collect raw traffic + data via a promiscuous ethernet tap. Statistical, fil- + tered or otherwise reduced data is retrieved from the + local agents by a global manager (the "collect" pro- + cess). + + + + +NOCTools2 Working Group [Page 114] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + CAVEATS + None. + + BUGS + Bug fixes, extensions, and other pointers are discussed + in the electronic mail forum, bytecounters. To join, + send a request to bytecounters-request@venera.isi.edu. 
+ Forum exchanges are archived in the file + bytecounters/bytecounters.mail, available via anonymous + FTP from venera.isi.edu. + + LIMITATIONS + NNStat presumes a topology of one or more long haul + networks gatewayed to local ethernets. + + A kernel mod is required to run with SunOS4. These mods + are described in the bytecounters archive. + + HARDWARE REQUIRED + Ethernet interface. Sun 3, Sun 4 (SPARC), or PC RT + workstation. + + SOFTWARE REQUIRED + Distribution is for BSD UNIX, could easily be adapted + to any UNIX with promiscuous ethernet support. + + AVAILABILITY + Distribution is available via anonymous FTP from + venera.isi.edu, in file pub/NNStat.tar.Z. Documenta- + tion is in pub/NNStat.userdoc.ms.Z. + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 115] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog NOCOL(8) + + NAME + nocol - network monitoring tools for an IP network + + SYNOPSIS + This is an overview of the NOCOL software. + + DESCRIPTION + NOCOL (Network Operations Center On-Line) is a + collection of network monitoring programs that run on + Unix systems. The software consists of a number of + monitoring agents that poll various parameters from any + system and put it in a format suitable for + post-processing. The post-processors can be a display + agent, an automated troubleshooting program, an + event logging program, etc. Presently, monitors for + tracking reachability, SNMP traps, data throughput + rate, and nameservers have been developed and are in + use. Addition of more monitoring agents is easy and + they will be added as necessary. A display agent- + nocol(1) using curses has already been developed. Work + on an "intelligent" module is currently in progress for + event logging and some automatic troubleshooting. + + All data collected by the monitoring agents follows a + fixed (non-readable) format.
Each data entry is termed + an event in NOCOL, and each event has certain flags and + severity associated with it. The display agent + nocol(1), displays the output of these monitoring + agents depending on the severity of the event. There + can be multiple displays running simultaneously and + all process the same set of monitored data. + + There are four levels of severity associated with an + event- CRITICAL, ERROR, WARNING and INFO. The severity + level is controlled independently by the monitoring + agents, and the decision to raise or set an event's + severity to any level depends on the logic imbedded in + the monitoring agent. + + As an example, for the pingmon(8) monitor, if a site is + unreachable via ping, it would be assigned a severity + of WARNING by pingmon, which would then elevate to + CRITICAL if the site is still unreachable after some + time. In the case of trapmon(8), an SNMP trap message + of EGP neighbor lost would be directly assigned a + severity level of CRITICAL, while a Warm Start trap is + + + +NOCTools2 Working Group [Page 116] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + assigned a severity of WARNING. + + The display agent (and other data post-processors) + would use this event severity to decide whether to + display it (or troubleshoot/log it) depending on the + user selected display severity level. + + The software is very flexible and allows enhancements + and development with a minimum amount of effort. The + display module processes all the files present in the + data directory, and displays them sequentially. This + allows new monitoring programs to simply start + generating data in the data directory and the display + module will automatically start displaying the new + data. The monitoring tools can be changed, and the only + element that has to remain common between all the + modules is the EVENT data structure.
+ + CURRENT MODULES + NOCOL presently consists of the following modules: + + nocol + which simply displays the data collected by the + monitoring agents. It uses the curses screen + management system to support a wide variety of terminal + types. The criterion for displaying an event is: + + 1. Severity level of the event is higher than the + severity level set in the display. + + 2. The display filter (if set) matches some string in + the event line. + + The display can be in regular 80 column mode or in + extended 132 column mode. Critical events are + displayed in reverse video (if the terminal type + supports it). Additional features like displaying + informational messages in a part of the window, + automatic resizing window sizes, operator + acknowledgement via a bell when a new event goes + critical are also available. + + ippingmon + which monitors the reachability of a site via "ICMP" + ping packets (ICMP was preferred over SNMP for many + obvious reasons). This program can use the default out- + put from the system's ping program, but an accompanying + program ( multiping) can ping multiple IP sites at the + + + +NOCTools2 Working Group [Page 117] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + same time and is preferable for monitoring a large list + of sites. A site is marked unreachable if a certain + number of packets is lost, and the severity level is + increased each time that the site tests unreachable. + + osipingmon + which is similar to the ippingmon module but uses the + OSI ping program instead. No multiple ping program for + OSI sites has been developed at this time. The only + requirement is that the system's ping program output + match the typical BSD IP ping program's output. + + nsmon + which monitors the nameservers (named) on the list of + specified hosts. It periodically sends an SOA query for + the default domain and if the queried nameservers + cannot resolve the query, then the site is elevated to + CRITICAL status. 
+ + tpmon + For monitoring the throughput (kbits per second) to a + list of hosts. The program connects to the discard + socket on the remote machine (using a STREAM socket) + and sends large packets for a small amount of time to + evaluate the effective throughput. It elevates a site + to WARNING level if the throughput drops below a + certain threshold (set in the configuration file). + + trapmon + Converts all SNMP traps into a format suitable for + displaying using NOCOL. The severity of the various + traps is preset (and can be changed during compilation + time). + + + PLATFORM + Any Unix system with the curses screen management library + and IP (Internet Protocol) programming facility. It has been + tested on Sun Sparc 4.1.1, Ultrix, and NeXT systems. Porting + to other platforms might require minor adjustments depending + on the vagaries of the different vendors (mostly in the + include files). + + AVAILABILITY + NOCOL was developed at JvNCnet and has been in use for + monitoring the JvNCnet wide area network since 1989. + It is available via anonymous FTP from ftp.jvnc.net under + pub/jvncnet-packages/nocol.tar.Z. The system running at + + + +NOCTools2 Working Group [Page 118] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + JvNCnet can be viewed by logging into the host nocol.jvnc.net + with username nocol (an rlogin instead of telnet will handle + your X window terminal types better). + To be added to the NOCOL mailing list (for future updates + and bug fixes), send a message to nocol-users- + request@jvnc.net with your email address. + + FUTURE DEVELOPMENTS + + Possible future enhancements are: + + 1. Event logging. + + 2. Addition of an automated troubleshooting mechanism + when a site severity level reaches a particular + level. + + 3. SNMP monitors to watch the state of certain vari- + ables (interface errors, packet rate, route state + changes). + + AUTHOR + The software was developed at JvNCnet over a period of time.
+ The overall design and initial development was done by Vikas + Aggarwal and Sze-Ying Wuu. Additional development is being + done and coordinated by Vikas Aggarwal (vikas@jvnc.net). + Copyright 1992 JvNCnet. (See the file COPYRIGHT for full + details) + + SEE ALSO + nocol(1) nocol(3) tpmon(8) tsmon(8) nsmon(8) + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 119] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog NPRV + + NAME + NPRV -- IP Node/Protocol Reachability Verifier + + KEYWORDS + map, routing, status; IP; ping; VMS; free. + + ABSTRACT + NPRV is a full-screen, keypad-oriented utility that + runs under VAX/VMS. It allows the user to quickly scan + through a user-defined list of IP addresses (or domain + names) and verify a node's reachability. The node's + reachability is determined by performing an ICMP echo, + UDP echo and a TCP echo at alternating three second + intervals. The total number of packets sent and + received are displayed, as well as the minimum, average + and maximum round-trip times (in milliseconds) for each + type of echo. Additionally, a "trace route" function + is performed to determine the path from the local sys- + tem to the remote host. Once all of the trace route + information has filled the screen, a "snapshot" of the + screen can be written to a text file. Upon exiting the + utility, these text files can be used to generate a + logical network map showing host and gateway intercon- + nectivity. + + MECHANISM + The ICMP echo is performed by sending ICMP ECHO REQUEST + packets. The UDP and TCP echoes are performed by con- + necting to the UDP/TCP echo ports (port number 7). The + trace route information is compiled by sending alter- + nating ICMP ECHO REQUEST packets and UDP packets with + very large destination UDP port numbers (in two + passes). Each packet is initially sent with a TTL + (time to live) of 1. 
This should cause an ICMP TIME + EXCEEDED error to be generated by the first routing + gateway. Then each packet is sent with a TTL of 2. + This should cause an ICMP TIME EXCEEDED error to be + generated by the second routing gateway. Then each + packet is sent with a TTL of 3, and so on. This pro- + cess continues until an ICMP ECHO REPLY or UDP PORT + UNREACHABLE is received. This indicates that the + remote host has been reached and that the trace route + information is complete. + + + + + + +NOCTools2 Working Group [Page 120] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + CAVEATS + This utility sends one echo packet per second (ICMP, + UDP or TCP), as well as sending out one trace route + packet per second. If a transmitted trace route packet + is returned in less than one second, another trace + route packet is sent in 100 milliseconds. This could + cause a significant amount of contention on the local + network. + + BUGS + None known. Please report any discovered bugs to the + author at: + Allen Sturtevant + National Magnetic Fusion Energy Computer Center + Lawrence Livermore National Laboratory + P.O. Box 808; L-561 + Livermore, CA 94550 + Phone : (415) 422-8266 + E-Mail: sturtevant@ccc.nmfecc.gov + + LIMITATIONS + The user is required to have SYSPRV privilege to per- + form the ICMP Echo and trace route functions. The + utility will still run with this privilege disabled, + but only the UDP Echo and TCP Echo information will be + displayed. This utility is written in C, but unfor- + tunately it cannot be easily ported over to UNIX since + many VMS system calls are used and all screen I/O is + done using the VMS Screen Management Routines. + + HARDWARE REQUIRED + Any network interface supported by TGV Incorporated's + MultiNet software. + + SOFTWARE REQUIRED + VAX/VMS V5.1+ and TGV Incorporated's MultiNet version + 2.0. 
+ + AVAILABILITY + For executables only, FTP to the ANONYMOUS account + (password GUEST) on CCC.NMFECC.GOV (128.55.128.30) and + GET the following files: + + [ANONYMOUS.PROGRAMS.NPRV]NPRV.DOC (ASCII text) + [ANONYMOUS.PROGRAMS.NPRV]NPRV.EXE (binary) + [ANONYMOUS.PROGRAMS.NPRV]SAMPLE.IPA (ASCII text) + + + + + +NOCTools2 Working Group [Page 121] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog NSLOOKUP + + NAME + nslookup + + KEYWORDS + status; DNS, BIND; UNIX, VMS; free. + + ABSTRACT + Nslookup is an interactive program for querying + Internet Domain Name System (DNS) servers. It is + essentially a user-friendly front end to + the BIND "resolver" library routines. + + This program is useful for converting a hostname + into an IP address (and vice versa), determining + the name servers for a domain , listing + the contents of a domain, displaying any type of + DNS record, such as MX, CNAME, SOA, etc., + diagnosing name server problems. + + By default, nslookup will query + the default name server but you can specify a + different server on the command line or from a + configuration file. You can also specify + different values for the options that control the + resolver routines. + + MECHANISM + The program formats, sends and receives DNS + (RFC 1034) queries. + + CAVEATS + None. + + BUGS + None known. + + LIMITATIONS + None known. + + HARDWARE REQUIRED + No restrictions. + + SOFTWARE REQUIRED + BSD UNIX or related OS, or VMS. + + + + + +NOCTools2 Working Group [Page 122] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + AVAILABILITY + NSLookup is included in the BIND distribution. + + Available via anonymous FTP from uunet.uu.net, + in directory /networking/ip/dns/bind. Available + with 4.xBSD UNIX and related operating systems. + For VMS, available as part of TGV MultiNet IP + software package, as well as Wollongong's WIN/TCP. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 123] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog OSITRACE + + NAME + OSITRACE + + KEYWORDS + traffic; OSI; eavesdrop; UNIX; free. + + ABSTRACT + OSITRACE is a network performance tool that displays + information about ISO TP4 connections. One line of + output is displayed for each packet indicating the + time, source, destination, length, packet type, + sequence number, credit, and any optional parameters + contained in the packet. Numerous options are avail- + able to control the output of OSITRACE. + + To obtain packets to analyze, OSITRACE uses Sun + Microsystems' Network Interface Tap (NIT) in SunOS 3.4, + 3.5, and 4.0.X. OSITRACE may also obtain data from the + NETMON utility which is described as another tool + entry. + + In Sun systems, OSITRACE may be easily installed: OSI + kernel support is not needed, nor is any other form of + OSI software support. + + MECHANISM + This tool has been designed in such a way that code to + process different protocol suites may be easily added. + As such, OSITRACE also has the ability to trace the DOD + TCP protocols. + + CAVEATS + None. + + BUGS + Bug reports and questions should be addressed to: ie- + tools@gateway.mitre.org + + Requests to join this mailing list: ie-tools- + request@gateway.mitre.org + + Questions and suggestions can also be directed to: Greg + Hollingsworth, gregh@gateway.mitre.org + + LIMITATIONS + None reported. + + + +NOCTools2 Working Group [Page 124] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + HARDWARE REQUIRED + No restriction. + + SOFTWARE REQUIRED + SunOS 3.4, 3.5, or 4.0.X, or BSD UNIX-like network pro- + tocols with NETMON installed. + + AVAILABILITY + OSITRACE is copyrighted by the MITRE-Washington Net- + working Center, but freely distributed "as is." 
It re- + quires retention of a copyright text in code derived + from it. The distribution is available by anonymous + FTP in pub/pdutrace.tar or pub/pdutrace.tar.Z from + aelred-3.ie.org. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 125] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog OVERVIEW + + NAME + OverVIEW + + KEYWORDS + manager, status; IP; NMS, SNMP; DOS. + + ABSTRACT + Network and internet monitor; Performance monitor; + Fully Graphic user interface; Event logging; TFTP boot + server + + MECHANISM + OverVIEW uses SNMP to query routers, gateways and + hosts. Also supports SGMP, PING and is committed to + CMIP/CMOT. The SNMP queries allow dynamic determina- + tion of configuration and state. Sets of related + queries allow monitoring of congestion and faults. + The hardware and software are sold as an integrated + package. + + CAVEATS + None. + + BUGS + None known. + + LIMITATIONS + 256 nodes, 256 nets + + HARDWARE REQUIRED + 80286, 640K, EGA, mouse. + + SOFTWARE REQUIRED + MS-DOS, OverVIEW, Network kernel, Mouse driver, SNMP + agents for monitored devices. + + AVAILABILITY + Fully supported product of Proteon, Inc. For more + information, contact: + Proteon, Inc. Phone: (508) 898-2800 + 2 Technology Drive Fax: (508) 366-8901 + Westborough, MA 01581 Telex: 928124 + + + + + + + +NOCTools2 Working Group [Page 126] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog PING + + NAME + ping + + KEYWORDS + generator, status; IP; ping; DOS, UNIX, VMS; free. + + ABSTRACT + Ping is perhaps the most basic tool for internet + management. It verifies that a remote IP implementa- + tion and the intervening networks and interfaces are + functional. It can be used to measure round trip + delay. Numerous versions of the ping program exist. + + MECHANISM + Ping is based on the ICMP ECHO_REQUEST message.
+ + CAVEATS + If run repeatedly, ping could generate high system + loads. + + BUGS + None known. + + LIMITATIONS + PC/TCP's ping is the only implementation known to support + both loose and strict source routing. Though some ping + implementations support the ICMP "record route" + feature, the usefulness of this option for debugging + routes is limited by the fact that many gateways do not + correctly implement it. + + HARDWARE REQUIRED + No restrictions. + + SOFTWARE REQUIRED + None. + + AVAILABILITY + Ping is widely included in TCP/IP distributions. Pub- + lic domain versions of ping are available via anonymous + FTP from uunet.uu.net, in directory bsd- + sources/src/etc, and from venera.isi.edu, in directory + pub. + + + + + + +NOCTools2 Working Group [Page 127] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog PROCESS-TCPWARE-SNMP + + NAME + SNMP agent + + KEYWORDS + alarm, manager, status, traffic; IP; SNMP; VMS;. + + ABSTRACT + The SNMP agent listens for and responds to network + management requests sent from SNMP-conforming network + management stations. The SNMP agent also sends SNMP + traps, under specific conditions, to identified trap + receivers. SNMP communities and generation of traps + are fully configurable. The SNMP agent supports all + MIB-II variables except the EGP group. + + MECHANISM + Network management variables are made available for + inspection and/or alteration by means of the Simple + Network Management Protocol (SNMP). + + CAVEATS + None. + + BUGS + No known bugs. + + LIMITATIONS + Does not yet provide the ability for sites to add + extra MIB definitions. + + HARDWARE REQUIRED + Supported VAX processors.
+ + SOFTWARE REQUIRED + VMS V4 or later + + AVAILABILITY + The SNMP agent is included in TCPware for VMS, a + commercial product available under license from: + Process Software Corporation + 959 Concord Street + Framingham, MA 01701 + +1 800 722 7770, +1 508 879 6994 (voice) + +1 508 879-0042 (FAX) TELEX 517891 + sales@process.com + + + + +NOCTools2 Working Group [Page 128] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog PROXYD + + NAME + proxyd -- SNMP proxy agent daemons from SNMP Research. + + KEYWORDS + control, management, status; + bridge, Ethernet, IP, OSI, ring, star; + NMS, SNMP; + UNIX; + library, sourcelib. + + ABSTRACT + SNMP proxy agents may be used to permit the monitoring + and controlling of network elements which are otherwise + not addressable using the SNMP management protocol + (e.g., a network bridge that implements a proprietary + management protocol). Similarly, SNMP proxy agents may + be used to protect SNMP agents from redundant network + management agents through the use of caches. Finally, + SNMP proxy agents may be used to implement elaborate + MIB access policies. + + The proxy agent daemon: + + - listens for SNMP queries and commands from logically + remote network management stations, + - translates and retransmits those as appropriate + network management queries or cache lookups, + - listens for and parses the responses, + - translates the responses into SNMP responses, and + - returns those responses as SNMP messages to the + network management station that originated the + transaction. + + The proxy agent daemon also emits SNMP traps to + identified trap receivers. The proxy agent daemon is + designed to make the addition of additional vendor- + specific variables a straight-forward task. The proxy + application comes complete with source code including a + powerful set of portable libraries for generating and + parsing SNMP messages and a set of command line utilities. 
+ + MECHANISM + Network management variables are made available for + inspection and/or alteration by means of the Simple + Network Management Protocol (SNMP). + + + + +NOCTools2 Working Group [Page 129] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + CAVEATS + None. + + BUGS + None known. + + LIMITATIONS + This application is a template for proxy application + writers. + + Only a few of the many LanBridge 100 variables are + supported. + + HARDWARE REQUIRED + System from Sun Microsystems, Incorporated. + + SOFTWARE REQUIRED + Sun OS 3.5 or 4.x. + + AVAILABILITY AND CONTACT POINT FOR INFORMATION ABOUT THIS TOOL + This is a commercial product available under license + from: + SNMP Research + 3001 Kimberlin Heights Road + Knoxville, TN 37920-9716 + Attn: John Southwood, Sales and Marketing + (615) 573-1434 (Voice) (615) 573-9197 (FAX) + + CONTACT POINT FOR CHANGES TO THIS CATALOG ENTRY + users@seymour1.cs.utk.edu + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 130] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog PROXYD_SNMP_RESEARCH + + NAME + proxyd -- SNMP proxy agent daemons from SNMP Research. + + KEYWORDS + control, management, status; + bridge, Ethernet, IP, OSI, ring, star; + NMS, SNMP; + UNIX; + library, sourcelib. + + ABSTRACT + SNMP proxy agents may be used to permit the monitoring + and controlling of network elements which are otherwise + not addressable using the SNMP management protocol + (e.g., a network bridge that implements a proprietary + management protocol). Similarly, SNMP proxy agents may + be used to protect SNMP agents from redundant network + management agents through the use of caches. Finally, + SNMP proxy agents may be used to implement elaborate + MIB access policies. 
+ + The proxy agent daemon: + + - listens for SNMP queries and commands from logically + remote network management stations, + - translates and retransmits those as appropriate + network management queries or cache lookups, + - listens for and parses the responses, + - translates the responses into SNMP responses, and + - returns those responses as SNMP messages to the + network management station that originated the + transaction. + + The proxy agent daemon also emits SNMP traps to + identified trap receivers. The proxy agent daemon is + designed to make the addition of additional vendor- + specific variables a straight-forward task. The proxy + application comes complete with source code including a + powerful set of portable libraries for generating and + parsing SNMP messages and a set of command line utilities. + + MECHANISM + Network management variables are made available for + inspection and/or alteration by means of the Simple + Network Management Protocol (SNMP). + + + + +NOCTools2 Working Group [Page 131] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + CAVEATS + None. + + BUGS + None known. + + LIMITATIONS + This application is a template for proxy application + writers. + + Only a few of the many LanBridge 100 variables are + supported. + + HARDWARE REQUIRED + System from Sun Microsystems, Incorporated. + + SOFTWARE REQUIRED + Sun OS 3.5 or 4.x. 
+ + AVAILABILITY AND CONTACT POINT FOR INFORMATION ABOUT THIS TOOL + This is a commercial product available under license + from: + SNMP Research + 3001 Kimberlin Heights Road + Knoxville, TN 37920-9716 + Attn: John Southwood, Sales and Marketing + (615) 573-1434 (Voice) (615) 573-9197 (FAX) + + CONTACT POINT FOR CHANGES TO THIS CATALOG ENTRY + users@seymour1.cs.utk.edu + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 132] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog QUERY + + NAME + query, ripquery + + KEYWORDS + routing; IP; spoof; UNIX; free. + + ABSTRACT + Query allows remote viewing of a gateway's routing + tables. + + MECHANISM + Query formats and sends a RIP request or POLL command + to a destination gateway. + + CAVEATS + Query is intended to be used as a tool for debugging + gateways, not for network management. SNMP is the pre- + ferred protocol for network management. + + BUGS + None known. + + LIMITATIONS + The polled gateway must run RIP. + + HARDWARE REQUIRED + No restriction. + + SOFTWARE REQUIRED + 4.3BSD UNIX or related OS. + + AVAILABILITY + Available with routed and gated distributions. + + Routed may be obtained via anonymous FTP from + uunet.uu.net, in file bsd- + sources/src/network/routed.tar.Z. + + Gated may be obtained via anonymous FTP from + devvax.tn.cornell.edu. Distribution files are in + directory pub/gated. + + + + + + + + +NOCTools2 Working Group [Page 133] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog SAS-CPE + + NAME + SAS/CPE(tm) for Open Systems Software + + KEYWORDS + manager, status; + bridge, ethernet, FDDI, IP, OSI, NFS; + X; + DOS, HP, UNIX; + library. + + ABSTRACT + SAS/CPE(tm) for Open Systems software is an integrated system designed + to facilitate the analysis and presentation of computer performance + and resource utilization data. SAS/CPE software features include: + + .
Processing of raw computer and network performance data into + detail-level SAS data sets. + . Conversion and validation of logged data values to forms + more useful for display and analysis (e.g., I/O counts + are converted to I/O rates per second). + . Numerous sample reports on performance data processed by + SAS/CPE software. + . Reduction of logged performance data into daily, weekly, + monthly or yearly summarized values. + . Menu-driven interface to the creation and management of multiple + performance data bases. + . Menu-driven report designing interface that allows users with no + programming knowledge to create and manage custom reports from + their performance data base. No SAS coding is needed for this + interface. + + MECHANISM + SAS/CPE for Open Systems processes and reports data + from SNMP and other proprietary monitoring protocols, + as well as du and accounting. + + CAVEATS + The product is currently in alpha testing. + + BUGS + None known. + + LIMITATIONS + None reported. + + + + + +NOCTools2 Working Group [Page 134] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + HARDWARE REQUIRED + HP, SUN or IBM Workstation + + SOFTWARE REQUIRED + The SAS(r) System Base Software, SAS/GRAPH Software and + SAS/CPE for Open System Software + + AVAILABILITY AND CONTACT POINT FOR INFORMATION ABOUT THIS TOOL + SAS/CPE for Open Systems Software is available from: + SAS Institute Inc. + SAS Campus Drive + Cary, NC 27513 + Phone 919-677-8000 + FAX 919-677-8123 + + CONTACT POINT FOR CHANGES TO THIS CATALOG ENTRY + Send email to snodjs@mvs.sas.com. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 135] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog SNIFFER + + NAME + Sniffer + + KEYWORDS + analyzer, generator, traffic; DECnet, ethernet, IP, + NFS, OSI, ring, SMTP, star; eavesdrop; standalone. 
+ + ABSTRACT + The Network General Sniffer is a protocol analyzer for + performing LAN diagnostics, monitoring, traffic genera- + tion, and troubleshooting. The Sniffer protocol + analyzer has the capability of capturing every packet + on a network and of decoding all seven layers of the + OSI protocol model. Capture frame selection is based + on several different filters: protocol content at lower + levels; node addresses; pattern matching (up to 8 + logically-related patterns of 32 bytes each); and des- + tination class. Users may extend the protocol + interpretation capability of the Sniffer by writing + their own customized protocol interpreters and linking + them to the Sniffer software. + + The Sniffer displays network traffic information and + performance statistics in real time, in user-selectable + formats. Numeric station addresses are translated to + symbolic names or manufacturer ID names. Network + activities measured include frames accepted, Kbytes + accepted, and buffer use. Each network version has + additional counters for activities specific to that + network. Network activity is expressed as + frames/second, Kbytes/second, or per cent of network + bandwidth utilization. + + Data collection by the Sniffer may be output to printer + or stored to disk in either print-file or spread-sheet + format. + + Protocol suites understood by the Sniffer include: + Banyan Vines, IBM Token-Ring, Novell Netware, XNS/MS- + Net (3Com 3+), DECnet, TCP/IP (including SNMP and + applications-layer protocols such as FTP, SMTP, and + TELNET), X Windows (for X version 11), NFS, and several + SUN proprietary protocols (including mount, pmap, RPC, + and YP). Supported LANs include: ethernet, Token-ring + (4Mb and 16Mb versions), ARCNET, StarLAN, IBM PC Net- + work (Broadband), and Apple Localtalk Network. 
+ + + +NOCTools2 Working Group [Page 136] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + MECHANISM + The Sniffer is a self-contained, portable protocol + analyzer that requires only AC line power and connection + to a network to operate. Normally passive (except when + in Traffic Generator mode), it captures images of all + or of selected frames in a working buffer, ready for + immediate analysis and display. + + The Sniffer is a standalone device. Two platforms are + available: one for use with single network topologies, + the other for use with multi-network topologies. Both + include Sniffer core software, a modified network + interface card (or multiple cards), and optional proto- + col interpreter suites. + + All Sniffer functions may be remotely controlled from a + modem-connected PC. Output from the Sniffer can be + imported to database or spreadsheet packages. + + CAVEATS + In normal use, the Sniffer is a passive device, and so + will not adversely affect network performance. Perfor- + mance degradation will be observed, of course, if the + Sniffer is set to Traffic Generator mode and connected + to an active network. + + BUGS + None known. + + LIMITATIONS + None reported. + + HARDWARE REQUIRED + None. The Sniffer is a self-contained unit, and + includes its own interface card. It installs into a + network as would any normal workstation. + + SOFTWARE REQUIRED + None. + + + + + + + + + + + + +NOCTools2 Working Group [Page 137] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + AVAILABILITY + The Sniffer is available commercially. For information + on your local representative, call or write: + Network General Corporation + 4200 Bohannon Drive + Menlo Park, CA 94025 + Phone: 415-688-2700 + Fax: 415-321-0855 + + For acquisition by government agencies, the Sniffer is + included on the GSA schedule.
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 138] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog SNMP_DEVELOPMENT_KIT + + NAME + The SNMP Development Kit + + KEYWORDS + manager, status; IP; NMS, SNMP; UNIX; free, sourcelib. + + ABSTRACT + The SNMP Development Kit comprises C Language source + code for a programming library that facilitates access + to the management services of the SNMP (RFC 1098). + Sources are also included for a few simple client + applications whose main purpose is to illustrate the + use of the library. Example client applications query + remote SNMP agents in a variety of modes, and generate + or collect SNMP traps. Code for an example SNMP agent + that supports a subset of the Internet MIB (RFC 1066) + is also included. + + MECHANISM + The Development Kit facilitates development of SNMP- + based management applications -- both clients and + agents. Example applications execute SNMP management + operations according to the values of command line + arguments. + + CAVEATS + None. + + BUGS + Fixed in the next release. + + LIMITATIONS + None reported. + + HARDWARE REQUIRED + The SNMP library source code is highly portable and + runs on a wide range of platforms. + + SOFTWARE REQUIRED + The SNMP library source code has almost no operating + system dependencies and runs in a wide range of + environments. Certain portions of the example SNMP + agent code are specific to the 4.3BSD implementation of + the UNIX system for the DEC MicroVAX. + + + + + +NOCTools2 Working Group [Page 139] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + AVAILABILITY + The Development Kit is available via anonymous FTP from + host allspice.lcs.mit.edu. 
The copyright for the + Development Kit is held by the Massachusetts Institute + of Technology, and the Kit is distributed without + charge according to the terms set forth in its code and + documentation. The distribution takes the form of a + UNIX tar file. + + Bug reports, questions, suggestions, or complaints may + be mailed electronically to snmp-dk@ptt.lcs.mit.edu, + although no response in any form is guaranteed. Dis- + tribution via UUCP mail may be arranged by contacting + the same address. Requests for hard-copy documentation + or copies of the distribution on magnetic media are + never honored. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 140] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog SNMP_Libraries_SNMP_RESEARCH + + NAME + SNMP Libraries and Utilities from SNMP Research. + + KEYWORDS + alarm, control, manager, map, security, status; + bridge, DECnet, Ethernet, FDDI, IP, OSI, ring, star; + NMS, SNMP; + DOS, UNIX, VMS; + sourcelib. + + ABSTRACT + The SNMP Libraries and Utilities serve two purposes: + + 1) to act as building blocks for the construction of + SNMP-based agent and manager applications; and + + 2) to act as network management tools for network + fire fighting and report generation. + + The libraries perform ASN.1 parsing and generation tasks + for both network management station applications and + network management agent applications. These libraries + hide the details of ASN.1 parsing and generation from + application writers and make it unnecessary for them to + be expert in these areas. The libraries are very robust + with considerable error checking designed in. The + several command line utilities include applications for + retrieving one or many variables, retrieving tables, or + effecting commands via the setting of remote network + management variables. + + MECHANISM + The parsing is performed via recursive descent methods. 
+ Messages are passed via the Simple Network Management + Protocol (SNMP). + + CAVEATS + None. + + BUGS + None known. + + LIMITATIONS + The monitored and managed nodes must implement the SNMP + over UDP per RFC 1157 or must be reachable via a proxy + agent. + + + +NOCTools2 Working Group [Page 141] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + HARDWARE REQUIRED + This software has been ported to numerous platforms + including workstations, general-purpose timesharing + systems, and embedded hardware in intelligent network + devices such as repeaters, bridges, and routers. + + SOFTWARE REQUIRED + C compiler, TCP/IP library. + + AVAILABILITY AND CONTACT POINT FOR INFORMATION ABOUT THIS TOOL + This is a commercial product available under license + from: + SNMP Research + 3001 Kimberlin Heights Road + Knoxville, TN 37920-9716 + Attn: John Southwood, Sales and Marketing + (615) 573-1434 (Voice) (615) 573-9197 (FAX) + + CONTACT POINT FOR CHANGES TO THIS CATALOG ENTRY + users@seymour1.cs.utk.edu + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 142] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog SNMP_PACKAGED_AGENT_SNMP_RESEARCH + + NAME + SNMP Packaged Agent System -- an SNMP host/gateway + agent daemon including a complete protocol stack and + runtime environment required to support an SNMP Agent + from SNMP Research. + + KEYWORDS + control, manager, status; + bridge, Ethernet, FDDI, IP, OSI, ring, star; + NMS, SNMP; + DOS, standalone, UNIX; + sourcelib. + + ABSTRACT + The snmpd agent daemon listens for and responds to + network management queries and commands from logically + remote network management stations. The agent daemon + also emits SNMP traps to identified trap receivers. + The agent daemon is designed to make the addition of + additional vendor-specific variables a + straight-forward task. 
The snmpd application comes + complete with source code including a powerful set of + portable libraries for generating and parsing SNMP + messages and a set of command line utilities. + + The Packaged Agent System is designed to aid the + hardware manufacturer who is not experienced with the + TCP/IP protocol suite. A lightweight, non-preemptive + scheduler/tasking system for faster execution and less + impact on slow CPUs is included in the package. + Development environment is either MS DOS or UNIX. + + MECHANISM + Network management variables are made available for + inspection and/or alteration by means of the Simple + Network Management Protocol (SNMP). + + CAVEATS + None. + + BUGS + None known. + + LIMITATIONS + None reported. + + + + +NOCTools2 Working Group [Page 143] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + HARDWARE REQUIRED + The Motorola 68XXX and the Intel 8088 and X86 + platforms are fully supported. Other platforms can be + supported. Contact SNMP Research for details. + + This software has been ported to numerous platforms + including workstations, general-purpose timesharing + systems, and embedded hardware in intelligent network + devices such as repeaters, bridges, and routers. + + SOFTWARE REQUIRED + C compiler. + + AVAILABILITY AND CONTACT POINT FOR INFORMATION ABOUT THIS TOOL + This is a commercial product available under license + from: + SNMP Research + 3001 Kimberlin Heights Road + Knoxville, TN 37920-9716 + Attn: John Southwood, Sales and Marketing + (615) 573-1434 (Voice) (615) 573-9197 (FAX) + + CONTACT POINT FOR CHANGES TO THIS CATALOG ENTRY + users@seymour1.cs.utk.edu + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 144] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog SNMPD_SNMP_RESEARCH + + NAME + snmpd -- an SNMP host/gateway agent daemon from SNMP + Research. 
+ + KEYWORDS + control, manager, status; + bridge, Ethernet, FDDI, IP, OSI, ring, star; + NMS, SNMP; + DOS, UNIX; + sourcelib. + + ABSTRACT + The snmpd agent daemon listens for and responds to + network management queries and commands from logically + remote network management stations. The agent daemon + also emits SNMP traps to identified trap receivers. The + agent daemon is architected to make the addition of + additional vendor-specific variables a straight-forward + task. The snmpd application comes complete with source + code including a powerful set of portable libraries for + generating and parsing SNMP messages and a set of + command line utilities. + + MECHANISM + Network management variables are made available for + inspection and/or alteration by means of the Simple + Network Management Protocol (SNMP). + + CAVEATS + None. + + BUGS + None known. + + LIMITATIONS + Only operating system variables available without + source code modifications to the operating system and + device drivers are supported. + + HARDWARE REQUIRED + This software has been ported to numerous platforms + including workstations, general-purpose timesharing + systems, and embedded hardware in intelligent network + devices such as repeaters, bridges, and routers. + + + + + +NOCTools2 Working Group [Page 145] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + SOFTWARE REQUIRED + C compiler.
+ + AVAILABILITY AND CONTACT POINT FOR INFORMATION ABOUT THIS TOOL + This is a commercial product available under license + from: + SNMP Research + 3001 Kimberlin Heights Road + Knoxville, TN 37920-9716 + Attn: John Southwood, Sales and Marketing + (615) 573-1434 (Voice) (615) 573-9197 (FAX) + + CONTACT POINT FOR CHANGES TO THIS CATALOG ENTRY + users@seymour1.cs.utk.edu + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 146] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog SPIDERMONITOR + + NAME + SpiderMonitor P220, K220 and + SpiderAnalyzer P320, K320 + + KEYWORDS + alarm, analyzer, generator, traffic; DECnet, ethernet, + IP, OSI; eavesdrop; standalone; sourcelib. + + ABSTRACT + The SpiderMonitor and SpiderAnalyzer are protocol + analyzers for performing ethernet LAN diagnostics, mon- + itoring, traffic generation, and troubleshooting. The + SpiderMonitor has the capability of capturing every + packet on a network and of decoding the first four + layers of the OSI protocol model. The SpiderAnalyzer + has additional software for decoding higher protocol + layers. Protocol suites understood: TCP/IP (including + SNMP and applications-layer protocols), OSI, XNS, DEC- + net and IPX. User-definable decodes can be written in + 'C' with the Microsoft version 5.0 'C' compiler. A + decode guide is provided. + + The SpiderAnalyzer supports multiple simultaneous + filters for capturing packets using predefined patterns + and error states. Filter patterns can also trigger on + NOT matching 1 or more filters, an alarm, or a speci- + fied time. + + The SpiderAnalyzer can also employ TDR (Time Domain + Reflectometry) to find media faults, open or short cir- + cuits, or transceiver faults. It can transmit OSI, + XNS, and Xerox link-level echo packets to user- + specified stations, performs loop round tests. 
+ + In traffic generation mode, the SpiderAnalyzer has the + ability to generate packets at random intervals of ran- + dom lengths or any combination of random or fixed + interval or length, generation of packets with CRC + errors, or packets that are too short, or packets that + are too long. + + Output from the SpiderMonitor/Analyzer can be imported + to database or spreadsheet packages. + + + + + + +NOCTools2 Working Group [Page 147] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + MECHANISM + The SpiderMonitor and Spider Analyzer are available as + stand-alone, IBM PC compatible packages based upon a + Compaq III portable system, or as plug-in boards for + any IBM XT/AT compatible machine. The model 220 (Spi- + derMonitor) systems provide a functional base suited + for most network management needs. The model 320 (Spi- + derAnalyzer) systems provide extended functionality in + the development mode and traffic generation mode as + well as more filtering capabilities than the 220 models. + + CAVEATS + Traffic generation will congest an operational ether- + net. + + BUGS + None known. + + LIMITATIONS + Monitoring of up to 1024 stations and buffering of up + to 1500 packets. The model 220 provides for 3 filters + with a filter depth of 46 bytes. The model 320 pro- + vides for 4 filters and a second level of filtering + with a filter depth of 64 bytes. + + HARDWARE REQUIRED + PX20s are self contained, the KX20s require an IBM + PC/XT-AT compatible machine with 5 megabytes of hard + disk storage and the spare slot into which the board + kit is plugged. + + SOFTWARE REQUIRED + None. The SpiderAnalyzer requires the Microsoft 'C' + Compiler, Version 5.0 for writing user defined decodes. + + AVAILABILITY + The SpiderMonitor/Analyzer is available commercially. + For information on your local representative, call or + write: + Spider Systems, Inc.
+ 12 New England Executive Park + Burlington, MA 01803 + Telephone: 617-270-3510 + FAX: 617-270-9818 + + + + + + + +NOCTools2 Working Group [Page 148] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog SPIMS + + NAME + SPIMS -- the Swedish Institute of Computer Science + (SICS) Protocol Implementation Measurement + System tool. + + KEYWORDS + benchmark, debugger; IP, OSI; spoof; UNIX. + + ABSTRACT + SPIMS is used to measure the performance of protocol + and "protocol-like" services including response time + (two-way delay), throughput and the time to open and + close connections. It has been used to: + + o benchmark alternative protocol implementations, + + o observe how performance varies when parameters in + specific implementations have been varied (i.e., + to tune parameters). + + SPIMS currently has interfaces to the DoD Internet Pro- + tocols: UDP, TCP, FTP, SunRPC, the OSI protocols from + the ISODE 4.0 distribution package: FTAM, ROSE, ISO TP0 + and to Sunlink 5.2 ISO TP4 as well as Stanford's VMTP. + Also available are a rudimentary set of benchmarks, + stubs for new protocol interfaces and a user manual. + + For an example of the use of SPIMS to tune protocols, + see: + Nordmark & Cheriton, "Experiences from VMTP: How + to achieve low response time," IFIP WG6.1/6.4: + Protocols for High-Speed Networks, May 1989, + Zurich. To be published. + + For an example of how SPIMS can be used to benchmark + protocols, see: + + Gunningberg, Bjorkman, Nordmark, Sjodin, Pink & + Stromqvist "Application Protocols and Performance + Benchmarks", IEEE Communications Magazine, June + 1989, Vol. 27, No.6, pp 30-36. 
+ + Sjodin, Gunningberg, Nordmark, & Pink, "Towards + Protocol Benchmarks', IFIP WG6.1/6.4 Protocols + for High-Speed Networks, May 1989, Zurich, pp + 57-67 + + + +NOCTools2 Working Group [Page 149] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + MECHANISM + SPIMS runs as user processes and uses a TCP connection + for measurement set-up. Measurements take place + between processes over the measured protocol. SPIMS + generates messages and transfers them via the measured + protocol service according to a user-supplied specifi- + cation. SPIMS has a unique measurement specification + language that is used to specify a measurement session. + In the language there are constructs for different + application types (e.g., bulk data transfer), for + specifying frequency and sequence of messages, for dis- + tribution over message sizes and for combining basic + specifications. These specifications are independent + of both protocols and protocol implementations and can + be used for benchmarking. For more details on the + internals of SPIMS, see: + + Nordmark & Gunningberg, "SPIMS: A Tool for Protocol + Implementation Performance Measurements" Proc. of 13:th + Conf. on Local Computer Networks, Minneapolis 1989, pp + 222-229. + + CAVEATS + None. + + BUGS + None known. + + LIMITATIONS + None reported. + + HARDWARE REQUIRED + No restrictions. + + SOFTWARE REQUIRED + SPIMS is implemented on UNIX, including SunOS 4., + 4.3BSD UNIX, DN (UNIX System V, with extensions) and + Ultrix 2.0/3.0. It requires a TCP connection for meas- + urement set-up. No kernel modifications or any modifi- + cations to measured protocols are required. + + + + + + + + + + + +NOCTools2 Working Group [Page 150] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + AVAILABILITY AND CONTACT POINT FOR INFORMATION ABOUT THIS TOOL + SPIMS is not in the public domain and the software is + covered by licenses. 
Use of the SPIMS software + represents acceptance of the terms and conditions of + the licenses. + The licenses are enclosed in the distribution package. + Licenses and SPIMS cover letter can also be obtained + via an Internet FTP connection without getting the whole + software. The retrieval procedure is identical to the + below university distribution via FTP. The file to + retrieve is pub/spims-dist/licenses.tar.Z + + There are two different distribution classes depending on + requesting organization: + + 1. Universities and non-profit organizations. + + To these organizations, SPIMS source code is distributed + free of charge. There are two ways to get the software: + + 1. FTP. + If you have an Internet FTP connection, you + can use anonymous FTP to sics.se + [192.16.123.90], and retrieve the file + pub/spims-dist/dist910304.tar.Z + (this is a .6MB compressed tar image) in + BINARY mode. Log in as user anonymous and at + the password prompt, use your complete + electronic mail address. + + 2. On a Sun 1/4-inch cartridge tape. + For mailing, a handling fee of US$150.00 will be + charged. Submit a bank check with the request. + Do not send tapes or envelopes. + + 2. Commercial organizations. + + These organizations can choose between a license for + commercial use, or a license for internal research + only and no commercial use whatsoever. + + For internal research use only: + + The SPIMS source code is distributed for a one + time fee of US$500.00. Organizations + interested in the research prototype need to + contact us via e-mail and briefly motivate why + they qualify (non-commercial use) for the + + + +NOCTools2 Working Group [Page 151] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + research prototype. + They will thereafter get a permission to + obtain a copy from the same distribution + source as for universities. + + Commercial use: + + A commercial version of SPIMS will eventually + be distributed and supported by a commercial + partner.
In the meantime we will distribute + the research prototype (source code) to + interested organizations without any guaranty + or support. Contact SICS for further + information. + + For more information about the research prototype + distribution and about a commercial license, contact: + + Swedish Institute of Computer Science + Att: Birgitta Klingenberg + P.O. Box 1263 + S-164 28 Kista + SWEDEN + + e-address: spims@sics.se + Phone: +46-8-7521500, Fax: +46-8-7517230 + + CONTACT POINT FOR CHANGES TO THIS CATALOG ENTRY + Bengt Ahlgren + Swedish Institute of Computer Science + Box 1263 + S-164 28 KISTA, SWEDEN + + Email: bengta@sics.se + Tel: +46 8 752 1562 (direct) + or +46 8 752 1500 + Fax: +46 8 751 7230 + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 152] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog SPRAY_SUN + + NAME + spray + + KEYWORDS + benchmark, generator; IP; ping; UNIX. + + ABSTRACT + Spray is a traffic generation tool that generates RPC + or UDP packets, or ICMP Echo Requests. The packets are + sent to a remote procedure call application at the des- + tination host. The count of received packets is + retrieved from the remote application after a certain + number of packets have been transmitted. The differ- + ence in packets received versus packets sent represents + (on a LAN) the packets that the destination host had to + drop due to increasing queue length. A measure of + throughput relative to system speed and network load + can thus be obtained. + + MECHANISM + See above. + + CAVEATS + Spray can congest a network. + + BUGS + None known. + + LIMITATIONS + None reported. + + HARDWARE REQUIRED + No restrictions. + + SOFTWARE REQUIRED + SunOS + + AVAILABILITY + Supplied with SunOS.
+ + + + + + + + + + +NOCTools2 Working Group [Page 153] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog TCPDUMP + + NAME + tcpdump + + KEYWORDS + traffic; ethernet, IP, NFS; UNIX, VMS; free. + + ABSTRACT + Tcpdump can interpret and print headers for the follow- + ing protocols: ethernet, IP, ICMP, TCP, UDP, NFS, ND, + ARP/RARP, AppleTalk. Tcpdump has proven useful for + examining and evaluating the retransmission and window + management operations of TCP implementations. + + MECHANISM + Much like etherfind, tcpdump writes a log file of the + frames traversing an ethernet interface. Each output + line includes the time a packet is received, the type + of packet, and various values from its header. + + CAVEATS + None. + + BUGS + None known. + + LIMITATIONS + Public domain version requires a kernel patch for + SunOS. TCPware for VMS - currently interprets headers + for IP, TCP, UDP, and ICMP only. + + HARDWARE REQUIRED + Any Ultrix system (VAX or DEC RISC hardware) + + SOFTWARE REQUIRED + Ultrix release 4.0 or later. For Ultrix 4.1, may + require the patched "if_ln.o" kernel module, available + from Digital's Customer Support Center. + + + + + + + + + + + + +NOCTools2 Working Group [Page 154] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + AVAILABILITY + Available, though subject to copyright restrictions, + via anonymous FTP from ftp.ee.lbl.gov. The source and + documentation for the tool is in compressed tar format, + in file tcpdump.tar.Z. Also available from + spam.itstd.sri.com, in directory pub. For VMS hosts + with DEC ethernet controllers, available as part of TGV + MultiNet IP software package and TCPware for VMS from + Process Software Corporation. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 155] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog TCPLOGGER + + NAME + tcplogger + + KEYWORDS + traffic; IP; eavesdrop; UNIX; free. + + ABSTRACT + Tcplogger consists of modifications to the 4.3BSD UNIX + source code, and a large library of post-processing + software. Tcplogger records timestamped information + from TCP and IP packets that are sent and received on a + specified connection. For each TCP packet, information + such as sequence number, acknowledgement sequence + number, packet size, and header flags is recorded. For + an IP packet, header length, packet length and TTL + values are recorded. Customized use of the TCP option + field allows the detection of lost or duplicate pack- + ets. + + MECHANISM + Routines of 4.3BSD UNIX in the netinet directory have + been modified to append information to a log in memory. + The log is read continuously by a user process and + written to a file. A TCP option has been added to + start the logging of a connection. Lots of post- + processing software has been written to analyze the + data. + + CAVEATS + None. + + BUGS + None known. + + LIMITATIONS + To get a log at both ends of the connection, the modi- + fied kernel should be run at both the hosts. + + All connections are logged in a single file, but + software is provided to filter out the record of a sin- + gle connection. + + HARDWARE REQUIRED + No restrictions. + + + + + +NOCTools2 Working Group [Page 156] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + SOFTWARE REQUIRED + 4.3BSD UNIX (as modified for this tool). + + AVAILABILITY + Free, although a 4.3BSD license is required. Contact + Olafur Gudmundsson (ogud@cs.umd.edu). 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 157] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog TOKENVIEW_PROTEON + + NAME + TokenVIEW + + KEYWORDS + control, manager, status; ring; NMS, proprietary; DOS. + + ABSTRACT + Network Management tool for 4/16 Mbit IEEE 802.5 Token + Ring Networks. Monitors active nodes and ring errors. + Maintains database of nodes, wire centers and their + connections. Separate network management ring allows + remote configuration of wire centers. + + MECHANISM + A separate network management ring used with Proteon + Intelligent Wire Centers allows wire center configura- + tion information to be read and modified from a single + remote workstation. A log of network events used with + a database contain nodes, wire centers and their con- + nections, facilitates tracking and correction of net- + work errors. Requires an "E" series PROM, sold with + package. + + CAVEATS + Currently, only ISA bus cards support the required E + series PROM. + + BUGS + None known. + + LIMITATIONS + 256 nodes, 1 net. + + HARDWARE REQUIRED + 512K RAM, CGA or better, hard disk, mouse supported. + + SOFTWARE REQUIRED + MS-DOS, optional mouse driver + + AVAILABILITY + Fully supported product of Proteon, Inc. Previously + sold as Advanced Network Manager (ANM). For more in- + formation, contact: + Proteon, Inc. Phone: (508) 898-2800 + 2 Technology Drive Fax: (508) 366-8901 + Westborough, MA 01581 Telex: 928124 + + + +NOCTools2 Working Group [Page 158] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog TRACEROUTE + + NAME + traceroute + + KEYWORDS + routing; IP; ping; UNIX, VMS; free. + + ABSTRACT + Traceroute is a tool that allows the route taken by + packets from source to destination to be discovered. 
+ It can be used for situations where the IP record route + option would fail, such as intermediate gateways dis- + carding packets, routes that exceed the capacity of a + datagram, or intermediate IP implementations that don't + support record route. Round trip delays between the + source and intermediate gateways are also reported + allowing the determination of individual gateways con- + tribution to end-to-end delay. + + Enhanced versions of traceroute have been developed + that allow specification of loose source routes for + datagrams. This allows one to investigate the return + path from remote machines back to the local host. + + MECHANISM + Traceroute relies on the ICMP TIME_EXCEEDED error + reporting mechanism. When an IP packet is received by + a gateway with a time-to-live value of 0, an ICMP + packet is sent to the host which generated the packet. + By sending packets to a destination with a TTL of 0, + the next hop can be identified as the source of the + ICMP TIME EXCEEDED message. By incrementing the TTL + field the subsequent hops can be identified. Each + packet sent out is also time stamped. The time stamp + is returned as part of the ICMP packet so a round trip + delay can be calculated. + + CAVEATS + Some IP implementations forward packets with a TTL of + 0, thus escaping identification. Others use the TTL + field in the arriving packet as the TTL for the ICMP + error reply, which delays identification. + + Sending datagrams with the source route option will + cause some gateways to crash. It is considered poor + form to repeat this behavior. + + + + +NOCTools2 Working Group [Page 159] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + BUGS + None known. + + LIMITATIONS + Most versions of UNIX have errors in the raw IP code + that require kernel mods for the standard version of + traceroute to work.
A version of traceroute exists + that runs without kernel mods under SunOS 3.5 (see + below), but it only operates over an ethernet inter- + face. + + HARDWARE REQUIRED + No restrictions. + + SOFTWARE REQUIRED + BSD UNIX or related OS, or VMS. + + AVAILABILITY + Available by anonymous FTP from ftp.ee.lbl.gov, in file + traceroute.tar.Z. It is also available from + uc.msc.umn.edu. + + A version of traceroute that supports Loose Source + Record Route, along with the source code of the + required kernel modifications and a Makefile for + installing them, is available via anonymous FTP from + zerkalo.harvard.edu, in directory pub, file + traceroute_pkg.tar.Z. + + A version of traceroute that runs under SunOS 3.5 and + does NOT require kernel mods is available via anonymous + FTP from dopey.cs.unc.edu, in file + ~ftp/pub/traceroute.tar.Z. + + For VMS, traceroute is available as part of TGV Mul- + tiNet IP software package. + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 160] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog TRPT + + NAME + TRPT -- transliterate protocol trace + + KEYWORDS + traffic; IP; eavesdrop; UNIX; free. + + ABSTRACT + TRPT displays a trace of TCP socket events. When no + options are supplied, TRPT prints all the trace records + found in a system, grouped according to TCP connection + protocol control block (PCB).
+ + An example of TRPT output is: + + 38241 ESTABLISHED:input + [e0531003..e0531203)@6cc5b402(win=4000) -> ESTA- + BLISHED + 38241 ESTABLISHED:user RCVD -> ESTABLISHED + 38266 ESTABLISHED:output + 6cc5b402@e0531203(win=4000) -> ESTABLISHED + 38331 ESTABLISHED:input + [e0531203..e0531403)@6cc5b402(win=4000) + -> CLOSE_WAIT + 38331 CLOSE_WAIT:output + 6cc5b402@e0531404(win=3dff) -> CLOSE_WAIT + 38331 CLOSE_WAIT:user RCVD -> CLOSE_WAIT + 38343 LAST_ACK:output + 6cc5b402@e0531404(win=4000) -> LAST_ACK + 38343 CLOSE_WAIT:user DISCONNECT -> LAST_ACK + 38343 LAST_ACK:user DETACH -> LAST_ACK + + MECHANISM + TRPT interrogates the buffer of TCP trace records that + is created when a TCP socket is marked for debugging. + + CAVEATS + Prior to using TRPT, an analyst should take steps to + isolate the problem connection and find the address of + its protocol control blocks. + + BUGS + None reported. + + + + + + + +NOCTools2 Working Group [Page 161] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + LIMITATIONS + A socket must have the debugging option set for TRPT to + operate. Another problem is that the output format of + TRPT is difficult. + + HARDWARE REQUIRED + No restrictions. + + SOFTWARE REQUIRED + BSD UNIX or related OS. + + AVAILABILITY + Included with BSD and SunOS distributions. Available + via anonymous FTP from uunet.uu.net, in file bsd- + sources/src/etc/trpt.tar.Z. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 162] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog TTCP + + NAME + TTCP + + KEYWORDS + benchmark, generator; IP; ping; UNIX, VMS; free. + + ABSTRACT + TTCP is a traffic generator that can be used for test- + ing end-to-end throughput. It is good for evaluating + TCP/IP implementations. + + MECHANISM + Cooperating processes are started on two hosts. They + open a TCP connection and transfer a high volume of + data.
Delay and throughput are calculated. + + CAVEATS + Will greatly increase system load. + + BUGS + None known. + + LIMITATIONS + None reported. + + HARDWARE REQUIRED + No restrictions. + + SOFTWARE REQUIRED + BSD UNIX or related OS, or VMS. + + AVAILABILITY + Source for BSD UNIX is available via anonymous FTP from + vgr.brl.mil, in file ftp/pub/ttcp.c, and from sgi.com, + in file sgi/src/ttcp.c. A version of TTCP has also + been submitted to the USENET news group + comp.sources.unix. For VMS, ttcp.c is included in the + MultiNet Programmer's Kit, a standard feature of TGV + MultiNet IP software package. + + + + + + + + + + +NOCTools2 Working Group [Page 163] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog UNISYS-PARAMAX + + NAME + Paramax Network Security Server + + KEYWORDS + alarm, control, manager, security, status; + ethernet, FDDI, IP; X; UNIX. + + ABSTRACT + The Paramax Network Security Server (NSS) is a + security officer's tool for centralized security + management of TCP/IP-based networks. The NSS provides + capability for collection, on-line storage, + maintenance, and correlation of audit data from hosts, + workstations, servers, and network devices. Through + the X window based user interface, a security officer + can review and analyze this audit data at the NSS, + select and request filtered portions of host audit + data, and receive and analyze security alerts from + across the network. The NSS supports centralized + access control of network resources through its + capability to create and update user and host access + permissions data. The user access permissions data + identifies network addresses that each user is + permitted to access. The host access permissions data + identifies network addresses between which + communication is permitted. 
The NSS supports + centralized management of user authentication data + (user IDs and passwords) and other user data for use + by hosts, workstations, and servers in the network. + It generates pseudo-random pronounceable passwords for + selection and assignment to users by the security officer. + + The NSS deadman timer locks the NSS screen or logs the + security officer off the NSS after periods of + inactivity. A biometric authentication device is + optional for rigorous fingerprint authentication of + users at the NSS, and logins to the NSS itself are + permitted only at the console. The NSS currently + provides centralized security management for a System High + Network. It is being upgraded for a Compartmented Mode + environment. + + + + + + + + +NOCTools2 Working Group [Page 164] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + MECHANISM + The NSS uses the Audit Information Transfer Protocol + (AITP) for the transfer of security alerts and audit + data. AITP is NOT proprietary, and the specification + is available from the address listed below. Access to + the NSS audit database is provided via the Structured + Query Language (SQL). + + CAVEATS + None. + + BUGS + None known. + + LIMITATIONS + None reported. + + HARDWARE REQUIRED + Hardware required is a Sun 4 (SPARCStation) with a color + monitor, at least 600 MB disk, and 150 MB 1/4" + cartridge tape drive. + + SOFTWARE REQUIRED + SunOS Version 4.1.1 running the Sun OpenWindows X + windowing environment and the SYBASE Relational Data + Base Management System. 
+ + AVAILABILITY AND CONTACT POINT FOR INFORMATION ABOUT THIS TOOL + Commercially available from: + Paramax Systems Corporation + 5151 Camino Ruiz + Camarillo, California 93011-6004 + 805-987-6811 + Peter Vazzana + + CONTACT POINT FOR CHANGES TO THIS CATALOG ENTRY + Paramax Systems Corporation + 5151 Camino Ruiz + Camarillo, California 93011-6004 + 805-987-6811 + Nina Lewis + + + + + + + + + + +NOCTools2 Working Group [Page 165] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog WOLLONGONG-MANAGER + + NAME + Management Station, Release 3.0 + + KEYWORDS + manager; ; snmp, x; sun, dec, dos;. + + ABSTRACT + Management Station is a network management software + product that supports SNMP. Release 3.0 implements a + distributed network management architecture that helps + solve the scalability and reliability limitations of + using a single cpu for all SNMP management tasks. + Additionally, there are many applications provided + that are all user-configurable. The + applications and their functionality are listed below: + + General Info: + + X Windows, 11.4 based implemented with OSF/Motif 1.1.1 + toolkit. X Windows interface for all configuration + files. Most applications have "verbose" mode for + display of SNMP PDU traffic. On-line help and + Reference manual pages. ANSI C compliant. + + Network Management Daemon: + + Responsible for device discovery, trap/alarm + management and fault monitoring for the network map. + Connection with other distributed daemons and any + connected stations is accomplished with SNMP/TCP. + Configured via Manager MIB; also incorporates SMUX MIB + (RFC 1227). Sends any information to INGRES, Oracle + or Sybase via an ESQL interface. User-defined actions + include: send alarm to map; send info to flat file; + execute ESQL command; call any UNIX system command; + forward traps and filter user-defined alarms.
+ User-defined alarms can use any boolean expression and + MIB variable expressions can be combined with AND/OR + statements. + + MIB Compiler + + ASN.1 MIB compiler with X Windows interface. Accepts + RFC 1155 and 1212 format. Most vendor-specific MIBs + and proposed Internet standard MIBs already included. + + + + +NOCTools2 Working Group [Page 166] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Network Map + + Comprehensive network monitoring map with click and + drag interface, hierarchical and virtual views. + Toolkit and preferences applications, device + discovery. Uses /etc/hosts file, NIS or DNS for + device resolution. Background pixmapping capability, + user-definable menu bar, network manager and console + operator modes via UNIX group permissions. Multiple + map use without limitation. + + MIB Form and MIB Form Editor + + User-designed, X-based SNMP applications. Alias for + MIB variables and interprets returned values. GET + NEXT and SET capability. User-defined polling and + multi-device [agent] capability. Configured via X + interface. + + MIB Chart and MIB Chart Editor + + Choice of strip chart, packed strip chart or bar + graphs. User-specified polling interval, MIB + variable(s) or MIB expressions using arithmetic + operands. Plot actual value, delta or delta/interval. + Plot multiple MIB expressions from multiple agents + simultaneously. X Windows interface. Pause polling + and grid options. + + MIB Tool + + X Windows application for the general viewing and + 'walking' of MIB trees. GET NEXT and SET options. + Window for viewing RFC 1212 MIB definitions. Command + line interface option. + + Application Programming Interface + + Complete set of APIs for developers to write SNMP + applications in character mode or X Windows. + + MECHANISM + Management Station uses SNMP and ICMP Echo Request to + monitor and control SNMP Agents. Network management + daemon implements Wollongong's Manager MIB, SNMP over + TCP and the SMUX protocol.
+ + + + + +NOCTools2 Working Group [Page 167] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + CAVEATS + none. + + BUGS + See Product Release Notice. + + LIMITATIONS + Limitations on number of management agents and network + management daemons not known at this time. + + HARDWARE REQUIRED + Sun SPARC workstations and servers + DEC DECstations and DECsystems + Motorola MPC (Delta 8000 series) + 3/486 PC and PC-compatible + + 16 MB RAM + n20 MB free disk space for installation + Color monitor strongly recommended + + SOFTWARE REQUIRED + SunOS 4.1-1 or greater & OpenWindows 2.0 or greater (SUN) + X Windows, 11.4 or greater + RISC ULTRIX 4.1 or greater (DEC) + R32V2 (Motorola) + Open Desktop 1.1 or greater (3/486) + + Provided on 1/4" cartridge, TK-50 or 3 1/2" diskettes, + as appropriate, in cpio format. + + AVAILABILITY + A commercial product of: + + The Wollongong Group, Inc. + 1129 San Antonio Rd + Palo Alto, CA. 94303 + ph.: (800) 962 - 8649 (in California) + (800) 872 - 8649 (outside California) + fax: (415) 962 - 0286 + + + + + + + + + + + + +NOCTools2 Working Group [Page 168] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog XNETDB + + NAME + Xnetdb + + KEYWORDS + database, manager, map, monitoring, status; IP; Ping, + SNMP, Unix, X; free. + + ABSTRACT + Xnetdb is a network monitoring tool based on X Windows + and SNMP which also has integrated database and + statistic viewing capabilities. Xnetdb will determine + and display the status of routers and circuits it has + been told to monitor by querying the designated sites + and displaying the result. It can also query the + status of certain designated SNMP variables, such as a + default route for an important router. Additionally, + it also has integrated database functionality in that + it can display additional information about a site or + circuit such as the equipment at the site, the contact + person(s) for the site, and other useful information. 
+ Finally it can gather designated statistical + information about a circuit and display it on demand. + + MECHANISM + Xnetdb uses SNMP or ping to monitor the things it is + configured to monitor. It dynamically builds a + network map on its display by querying entities and + obtaining IP addresses and subnet masks. A + configuration file tells xnetdb which IP hosts you + want to monitor. + + CAVEATS + While "ping" can be used to monitor hosts, more useful + results are obtained using SNMP. + + BUGS + Bugs and other assorted topics are discussed on the + xnetdb mailing list. To join, send a note to + "xnetdb-request@oar.net". + + LIMITATIONS + None. + + HARDWARE REQUIRED + No restrictions. + + + + +NOCTools2 Working Group [Page 169] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + SOFTWARE REQUIRED + Most any variety of UNIX plus X-Windows and/or + OpenWindows. + + AVAILABILITY + Available via anonymous ftp from ftp.oar.net + (currently 131.187.1.102) in the directory /pub/src. + Special arrangements can be made for sites without + direct IP access by sending a note to + "xnetdb-request@oar.net". There are minimal licensing + restrictions - these are detailed within the package. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 170] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog XNETMON_SNMP_RESEARCH + + NAME + XNETMON -- an X windows based SNMP network management + station from SNMP Research. + + KEYWORDS + alarm, benchmark, control, debugger, manager, map, + reference, security, status, traffic; + bridge, DECnet, Ethernet, FDDI, IP, OSI, ring, star; + NMS, Ping, SNMP, X; + UNIX; + Sourcelib. + + ABSTRACT + The XNETMON application implements a powerful network + management station based on the X window system.
+ XNETMON's network management tools for configuration, + performance, security, and fault management have been + used successfully with a wide assortment of wide- and + local-area-network topologies and medias. + Multiprotocol devices are supported + including those using TCP/IP, DECnet, and OSI + protocols. + + Some features of XNETMON's network management tools include: + + o Fault management tool displays a map of the network + configuration with node and link state indicated + in one of several colors to indicate current status; + o Configuration management tool may be used to edit the + network management information base stored in the + NMS to reflect changes occurring in the network; + o Graphs and tabular tools for use in fault and performance + management (e.g. XNETPERFMON); + o Mechanisms by which additional variables, such as vendor- + specific variables, may be added; + o Alarms may be enabled to alert the operator of events + occurring in the network; + o Events are logged to disk; + o Output data may be transferred via flat files for + additional report generation by a variety of + statistical packages. + + The XNETMON application comes complete with source + code including a powerful set of portable libraries + for generating and parsing SNMP messages. + + + + +NOCTools2 Working Group [Page 171] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + MECHANISM + XNETMON is based on the Simple Network Management + Protocol (SNMP). Polling is performed via the + powerful SNMP get-next operator and the SNMP get + operator. Trap-directed polling is used to regulate + focus and intensity of the polling. + + CAVEATS + None. + + BUGS + None known. + + LIMITATIONS + Monitored and managed nodes must implement the SNMP over + UDP per RFC 1157 or must be reachable via a proxy agent. + + HARDWARE REQUIRED + X windows workstation with UDP socket library. + Monochrome is acceptable, but color is far superior. 
+ + SOFTWARE REQUIRED + X windows version 11 release 4 or later or MOTIF. + + AVAILABILITY AND CONTACT POINT FOR INFORMATION ABOUT THIS TOOL + This is a commercial product available under license + from: + SNMP Research + 3001 Kimberlin Heights Road + Knoxville, TN 37920-9716 + Attn: John Southwood, Sales and Marketing + (615) 573-1434 (Voice) (615) 573-9197 (FAX) + + CONTACT POINT FOR CHANGES TO THIS CATALOG ENTRY + users@seymour1.cs.utk.edu + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 172] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog XNETMON_WELLFLEET + + NAME + xnetmon, xpmon + + KEYWORDS + alarm, manager, map, status; IP; NMS, SNMP; UNIX. + + ABSTRACT + Xnetmon and xpmon provide graphical representation of + performance and status of SNMP-capable network ele- + ments. Xnetmon presents a schematic network map + representing the up/down status of network elements; + xpmon draws a pen plot style graph of the change over + time of any arbitrary MIB object (RFC1066). Both xnet- + mon and xpmon use the SNMP (RFC1098) for retrieving + status and performance data. + + MECHANISM + Xnetmon polls network elements for the status of their + interfaces on a controllable polling interval. Pop-up + windows displaying the values of any MIB variable are + supported by separate polls. When SNMP traps are + received from a network element, that element and all + adjacent elements are immediately re-polled to update + their status. The layout of the network map is stati- + cally configured. Xpmon repeatedly polls (using SNMP) + the designated network element for the value of the + designated MIB variable on the user-specified interval. + The change in the variable is then plotted on the strip + chart. The strip chart regularly adjusts its scale to + the current maximum value on the graph. + + CAVEATS + Polling intervals should be chosen with care so as not + to affect system performance adversely. + + BUGS + None known. 
+ + LIMITATIONS + None reported. + + HARDWARE REQUIRED + Distributed and supported for Sun-3 systems. + + SOFTWARE REQUIRED + SunOS 3.5 or 4.x; X11, release 2 or 3. + + + +NOCTools2 Working Group [Page 173] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + AVAILABILITY + Commercial product of: + Wellfleet Communications, Inc. + 12 DeAngelo Drive + Bedford, MA 01730-2204 + (617) 275-2400 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 174] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog XNETPERFMON_SNMP_RESEARCH + + NAME + xnetperfmon -- a graphical network performance and + fault management tool from SNMP Research. + + KEYWORDS + manager, security, status; + DECnet, Ethernet, IP, OSI, ring, star; + NMS, SNMP, X; + DOS, UNIX, VMS; + sourcelib. + + ABSTRACT + Xnetperfmon is a XNETMON tool used to produce plots of + SNMP variables in graphical displays. The manager may + easily customize the labels, step size, update interval, + and variables to be plotted to produce graphs for fault + and performance management. Scales automatically adjust + whenever a point to be plotted would go off scale. + + MECHANISM + The xnetperfmon application communicates with remote + agents or proxy agents via the Simple Network Management + Protocol (SNMP). + + CAVEATS + All plots for a single invocation of xnetperfmon must be + for variables provided by a single network management + agent. However, multiple invocations of xnetperfmon may + be active on a single display simultaneously or proxy + agents may be used to summarize information at a common + point. + + BUGS + None known. + + LIMITATIONS + None reported. + + HARDWARE REQUIRED + Systems supporting X windows. 
+ + SOFTWARE REQUIRED + XNETMON from SNMP Research and X Version 11 release 4 or + later (option MOTIF) + + + + + +NOCTools2 Working Group [Page 175] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + AVAILABILITY AND CONTACT POINT FOR INFORMATION ABOUT THIS TOOL + This is a commercial product available under license + from: + + SNMP Research + 3001 Kimberlin Heights Road + Knoxville, TN 37920-9716 + Attn: John Southwood, Sales and Marketing + (615) 573-1434 (Voice) (615) 573-9197 (FAX) + + CONTACT POINT FOR CHANGES TO THIS CATALOG ENTRY + users@seymour1.cs.utk.edu + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 176] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + Internet Tool Catalog XUP_HP + + NAME + xup + + KEYWORDS + status; ping, X; HP. + + ABSTRACT + Xup uses the X-Windows to display the status of an + "interesting" set of hosts. + + MECHANISM + Xup uses ping to determine host status. + + CAVEATS + Polling for status increases network load. + + BUGS + None known. + + LIMITATIONS + None reported. + + HARDWARE REQUIRED + Runs only on HP series 300 and 800 workstations. + + SOFTWARE REQUIRED + Version 10 of X-Windows. + + AVAILABILITY + A standard command for the HP 300 & 800 Workstations. + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 177] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + +Appendix: "No-Writeups" + + This section contains references to tools which are known to exist, + but which have not been fully cataloged. If anyone wishes to author + an entry for one of these tools please contact: noctools- + request@merit.edu. + + Each mention is separated by a for improved readability. + If you intend to actually print-out this section of the catalog, then + you should probably strip-out the . + +tuecho.c + +/* + * Send / receive TCP or UDP echos in any of a number of bizzare ways. + * + * Joel P. 
Bion, March 1990 + * Copyright (c) 1990 cisco Systems. All rights reserved. + * + * This "tuecho" program is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Prompts as: + * Host: -- host to send echos to -- can be name or a.b.c.d -- + * Enter protocol (0 = UDP, 1 = TCP) [0]: -- UDP or TCP + * Size of data portion (bytes) [100]: -- bytes in data, excluding + * headers -- Number of bursts [5]: -- number of bursts of packets to + * send -- Packets per burst [1]: -- packets per burst, all sent AT + * ONCE -- Timeout (seconds) [2]: -- how long to wait for data + * Pause interval (seconds) [0]: -- Pause interval between bursts of + * frames + * Type of pattern (specify = 0, increment = 1) [1]: + * -- if 0 specified, allow you to specify a 16bit pattern + -- as four hex digits (see below). If 1, will create a + -- "incrementing", cycling pattern from 0x0000 -> 0xffff + -- ->. + * Enter pattern (hex value) [abcd]: -- if "0" specified above + */ + +Availability: + ftp.uu.net:/networking/cisco/tuecho.c + ftp.cisco.com:tuecho.c + + + + + + + + +NOCTools2 Working Group [Page 178] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + +SPY An NFS monitoring/tracing tool + +Availability: + A postscript file describing SPY is located on + ftp.uu.net:/networking/ip/nfs/spy.ps.Z + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 179] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + +NFSTRACE + + This is the rpcspy/nfstrace package. + + It is described in detail in the paper "NFS Tracing by Passive + Network Monitoring", which appeared in the January, 1992 USENIX + conference. + + You'll need either a DEC machine running ULTRIX (with the + packetfilter installed in the kernel) or a Sun running SunOS 4.x + (with NIT). Or you'll need to do a bit of hacking. 
+ + The package differs slightly from the version in the paper: + + + - The handle->name translation facility has been removed. It's + just too fragile to include in the general release. If you need it, + contact me directly and I'll be happy to mail you the code. + + - The output format is a wee-bit different. + + - The IBM-RT Enet filter version is also not included, since I seem to + be the only person in the world running it. RTs are really too slow + for this anyway. + + To configure the package, edit the makefile in the obvious (to me at + least) way. + + Note that not all versions of SunOS NIT have working versions of + the packet timestamp mechanism. Try to set the -DSTAMPS option in + the makefile, and if that doesn't work, take it out. + + If you are actually going to use this to gather traces, I'd like to + hear from you! Please send email, and share your results/traces if + your organization will allow it. I maintain a mailing list of users + for updates, etc. Send me mail to be added to it. + + Happy tracing. + Matt Blaze + Department of Computer Science + Princeton University + 35 Olden Street + Princeton, NJ 08544 + mab@cs.princeton.edu + 609-258-3946 + + Availability: + ftp.uu.net:/networking/ip/nfs/nfstrace.shar (or check archie) + + + +NOCTools2 Working Group [Page 180] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + LAMER + + # Lame delegation notifier + # Author: Bryan Beecher + # Last Modified: 6/25/92 + # + # To make use of this software, you need to be running the + # University of Michigan release of BIND 4.8.3, or any version + # of named that supports the LAME_DELEGATION patches posted to + # USENET. The U-M release is available via anonymous ftp from + # terminator.cc.umich.edu:/unix/dns/bind4.8.3.tar.Z. + # + # You must also have a copy of query(1) and host(1). These + # are also available via anonymous ftp in the aforementioned + # place.
+ # ------------------------------------------------------------- + + # ------------------------------------------------------------- + # handle arguments + # ------------------------------------------------------------- + # -d + # This flag is used to append a dot-day suffix to the LOGFILE. + # Handy where log files are kept around for the last week + # and contain a day suffix. + # + # -f + # Change the LOGFILE value altogether. + # + # -w + # Count up all of the DNS statistics for the whole week. + # + # -v + # Be verbose. + # + # -t + # Test mode. Do not send mail to the lame delegation + # hostmasters. + + Availability: + ftp.uu.net:/networking/ip/dns/lamer.tar.Z (or check archie) + + + + + + + + + + + +NOCTools2 Working Group [Page 181] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + HOST + + host - look up host names using domain server + +SYNOPSIS + host [-v] [-a] [-t querytype] [options] name [server] + host [-v] [-a] [-t querytype] [options] -l domain [server] + host [-v] [options] -H [-D] [-E] [-G] domain + host [-v] [options] -C domain + host [-v] [options] -A host + +DESCRIPTION + host looks for information about Internet hosts or domains. + It gets this information from a set of interconnected + servers that are spread across the world. By default, it + simply converts between host names and Internet addresses. + However, with the -t, -a and -v options, it can be used to + find all of the information about hosts or domains that is + maintained by the domain nameserver. + +/* + * Extensively modified by E. Wassenaar, Nikhef-H, + * + * The officially maintained source of this program is available + * via anonymous ftp from machine 'ftp.nikhef.nl' [192.16.199.1] + * in the directory '/pub/network' as 'host.tar.Z' + * + * Also available in this directory are patched versions of the + * BIND 4.8.3 nameserver and resolver library which you may need + * to fully exploit the features of this program, although they + * are not mandatory. 
See the file 'README_FIRST' for details. + * + * You are kindly requested to report bugs and make suggestions + * for improvements to the author at the given email address, + * and to not re-distribute your own modifications to others. + */ +/* + * New features + * + * - Major overhaul of the whole code. + * - Very rigid error checking, with more verbose error messages. + * - Zone listing section completely rewritten. + * - It is now possible to do recursive listings into subdomains. + * - Maintain resource record statistics during zone listings. + * - Maintain count of hosts during zone listings. + * - Exploit multiple server addresses if available. + * - Option to exploit only primary server for zone transfers. + * - Option to exclude info from names that do not reside in a domain. + + + +NOCTools2 Working Group [Page 182] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + * - Implement timeout handling during connect and read. + * - Write resource record output to optional logfile. + * - Special MB tracing by recursively expanding MR and MG records. + * - Special mode to check SOA records at each nameserver for domain. + * - Special mode to check inverse mappings of host addresses. + * - Code is extensively documented. + */ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 183] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + +PINGs + +Many many versions of the PING program exist. +Each implementation has its own set of additional features. +Here are a few more PINGs that are worth taking a look at. + +Version on ftp.cc.berkeley.edu:pub/ping: + This version has duplicate packet detection, Record Route, + ability to specify data pattern for packets, flood pinging, an + interval option, Multicast support, etc. 
+ +Version on nikhefh.nikhef.nl:/pub/network/rping.tar.Z: + 'rping' is just like 'ping', but only a single probe packet + is sent to test the reachability of a destination. + As an option, the loose source routing facility is used + to show the roundtrip route the packet has taken. + Multiple addresses of remote hosts are tried until one + responds. As an option, each of multiple addresses can be + probed unconditionally. + Contains a patch for making loose source routing work in + case you have a SUN with an OMNINET ethernet controller. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 184] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + +VRFY + +vrfy.tar.Z (Version 921021) + 'vrfy' is a tool to verify email addresses and mailing lists. + In its simplest form it takes an address "user@domain", figures + out the MX hosts for "domain", and issues the SMTP command VRFY + at the primary MX host (optionally all), or at "domain" itself + if no MX hosts exist. Without "domain" it goes to "localhost". + More complex capabilities are: recursively expanding forward + files or mailing lists, and detecting mail forwarding loops. + Full-blown RFC822 address specifications are understood. + Syntax checking can be carried out either locally or remotely. + Various options are provided to exploit alternative protocol + suites if necessary, and to print many forms of verbose output. + Obvious limitations exist, but on average it works pretty well. + Needless to say you need internet (nameserver and SMTP) access. + See the man page and the extensive documentation in the source + for further details. 
+ +Please send comments and suggestions to Eric Wassenaar + +If you want to receive notification of updates, please send an email +with the keyword "subscribe" in the subject or the body to the address + + +available as: nikhefh.nikhef.nl:/pub/network/vrfy.tar.Z + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 185] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + +XNETLOAD + +NAME + xnetload - ethernet load average display for X + +SYNOPSIS + xnetload[-toolkitoption ...] [-scale integer] + [-update seconds] [-hl color] [-highlight color] + [-jumpscroll pixels] [-label string] [-nolabel] host + +DESCRIPTION + The xnetload program displays a periodically updating histo- + gram of the ethernet load average for the specified host. + The resulting graph is scaled as 0% to 100%, where 0% + corresponds to 0mbs and 100% corresponds to 10mbs. NOTE: + The specified host must be running rpc.etherd. + +This program has been run using X11R4 and X11R5, under the following +operating systems: + + SUNOS 4.1.0 + SUNOS 4.1.1 + ULTRIX V4.2 + IRIX 3.3.2 + +Assuming the Imake templates and Rules are in order and in the proper +place on your system, these programs should compile and link +straightforward by running the following sequence: + + xmkmf + make + +Then, as root, issue the following: + + make install + make install.man + +Then, on your host system, (or on any other system you can rlogin or rsh +into) start the etherd daemon with the following (must be root): + + /usr/etc/rpc.etherd le0 & + +where le0 is the mnemonic for the primary ethernet interface. + +To start the xnetload program, the following command line is suggested: + + ./xnetload -hl red host & + + + + +NOCTools2 Working Group [Page 186] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + +where "host" is the name of any reachable network node (including +LOCALHOST) that is running the etherd daemon. 
A small xload window +should appear on your local display with nine horizontal lines. The +label: + "Ethernet Load %" +should appear in the upper left hand corner, just below any additional +title bars or other decorations provided by your window manager. If the +program comes up without the nine lines, or without the "Ethernet Load" +label, then either your resource file is not properly installed in the +appropriate app-defaults directory, or you may have picked up the wrong +xnetload image. Try re-running "make install" as root, or be sure to +include the "./" in front of the command name. + +Good Luck! + +The following changes have been made to this directory since R3: + + o Now use Athena StripChart widget. + + o Understands WM_DELETE_WINDOW. + + o 3-26-92 Modified from xload to xnetload by Roger Smith, + Sterling Software at NASA-Ames Research Center, + Mountain View, Calif. rsmith@proteus.arc.nasa.gov + +Availability: + ftp proteus.arc.nasa.gov:pub/XEnetload.tar.Z (or check archie) + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 187] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + +NETTEST + + nettest, nettestd - Performs client and server functions for + timing data throughput + + The nettest and nettestd commands invoke client and server + programs that are used for timing data throughput of various + methods of interprocess communication. For TCP and OSI con- + nections, the nettest program establishes a connection with + the nettestd program, and then it does count writes of size + bytes, followed by count reads of size bytes. For UDP, the + nettest program performs only writes; reads are not per- + formed. The nettestd program, if used with UDP connections, + reads the data packets and prints a message for each data + packet it receives. 
The number and size of the reads and + writes may not correlate with the number and size of the + actual data packets that are transferred; it depends on the + protocol that is chosen. If you append an optional k (or K) + to the size, count, or bufsize value, the number specified + is multiplied by 1024. + + The source for nettest and nettestd is provided on an "as is" + basis. Cray Research does not provide any support for this code + (unless you are a customer who has purchased the UNICOS operating + system). + + We will gladly take bug reports for nettest/nettestd. Suggested + fixes are preferred to just bug reports. Changes to allow + nettest/nettestd to run on other architectures are also welcomed. We + will try to incorporate bugfixes and update the publicly available + code, but we can make no guarantees. + + For copyright information, see the notice in each source file. + + Send bug-reports/fixes to: + E-mail: dab@cray.com + U.S. Mail: David Borman + Cray Research, Inc. + 655F Lone Oak Drive + Eagan, MN 55121 + Notes: + + 1) The -b option to nettestd has not been tested... + 2) The ISO code should work on a 4.4BSD system, but the + gethostinfo() routine is specific to UNICOS... + + Availability: + ftp sgi.com:/sgi/src/nettest + + + +NOCTools2 Working Group [Page 188] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + ETHERCK + + etherck is a simple program that displays Sun ethernet statistics. + If you have a high percentage of input errors that are due to "out of + buffers", then you can run the "iepatch" script to patch a kernel + that uses the Intel ethernet chip ("ie"). A back of the envelope + calculation shows that a .25% input error rate gives about a 10% + degradation of NFS performance if 8k packets are being used. + + In our environment at Legato, patching the ie buffer allocation made + the input error rate drop more than 2 orders of magnitude.
This was + after we had applied other networking fixes (e.g., using Prestoserve, + going from thin wire to twisted pair) and pushed a higher load on the + server. + + Note that both etherck and iepatch must be run by root (or you can + make etherck setgid kmem). + + Availability: + send EMAIL to: request@legato.com + with a Subject line: send unsupported etherck + + The following is part of the 'help' file from the Legato Email + Server: + + This message comes to you from the request server at Legato.COM, + request@Legato.COM. It received a message from you asking for help. + + The request server is a mail-response program. That means that you + mail it a request, and it mails back the response. + + The request server is a very dumb program. It does not have much + error checking. If you don't send it the commands that it + understands, it will just answer "I don't understand you". + + The request server has 4 commands. Each command must be the first + word on a line. The request server reads your entire message before + it does anything, so you can have several different commands in a + single message. The request server treats the "Subject:" header line + just like any other line of the message. You can use any combination + of upper and lower case letters in the commands. + + The request server's files are organized into a series of directories + and subdirectories. Each directory has an index, and each + subdirectory has an index. The top-level index gives you an overview + of what is in the subdirectories, and the index for each subdirectory + tells you what is in it. + + + + +NOCTools2 Working Group [Page 189] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + The server has 4 commands: + + "help" command: The command "help" or "send help" causes the server to + send you the help file. You already know this, of course, + because you are reading the help file. 
No other commands are + honored in a message that asks for help (the server figures + that you had better read the help message before you do + anything else). + + SEND a request to Legato to get the rest of the help file! + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 190] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + + NETCK + + netck is a shar file that contains the sources to build "netck", a + network checker that uses the rstat(3R) protocol to gather and print + statistics from machines on the network. netck is useful to help + understand what part of what machines are potential NFS bottlenecks. + To get this file, send email to the request server with the command + "send unsupported netck". + + Availability: + same as ETHERCK (send email To: request@legato.com; subject: + HELP) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 191] + +RFC 1470 FYI: Network Management Tool Catalog June 1993 + + +References + + [1] Stine, R., Editor, "FYI on a Network Management Tool Catalog: + Tools for Monitoring and Debugging TCP/IP Internets and + Interconnected Devices", FYI 2, RFC 1147, Sparta, Inc., April + 1990. + +Security Considerations + + Security issues are not discussed in this memo. + +Authors' Addresses + + Robert M. Enger + Advanced Network and Services + 1875 Campus Commons Drive, Suite 220 + Reston, VA. 22091-1552 + + Phone: 703-758-7722 + EMail: enger@reston.ans.net + + + Joyce K. 
Reynolds + Information Sciences Institute + University of Southern California + 4676 Admiralty Way + Marina del Rey, CA 90292 + + Phone: (310) 822-1511 + Email: JKREY@ISI.EDU + + + + + + + + + + + + + + + + + + + + + +NOCTools2 Working Group [Page 192] + \ No newline at end of file diff --git a/ext/picotcp/RFC/rfc1644.txt b/ext/picotcp/RFC/rfc1644.txt new file mode 100644 index 0000000..2aca5a6 --- /dev/null +++ b/ext/picotcp/RFC/rfc1644.txt @@ -0,0 +1,2131 @@ + + + + + + +Network Working Group R. Braden +Request for Comments: 1644 ISI +Category: Experimental July 1994 + + T/TCP -- TCP Extensions for Transactions + Functional Specification + +Status of this Memo + + This memo describes an Experimental Protocol for the Internet + community, and requests discussion and suggestions for improvements. + It does not specify an Internet Standard. Distribution is unlimited. + +Abstract + + This memo specifies T/TCP, an experimental TCP extension for + efficient transaction-oriented (request/response) service. This + backwards-compatible extension could fill the gap between the current + connection-oriented TCP and the datagram-based UDP. + + This work was supported in part by the National Science Foundation + under Grant Number NCR-8922231. + +Table of Contents + + 1. INTRODUCTION .................................................. 2 + 2. OVERVIEW ..................................................... 3 + 2.1 Bypassing the Three-Way Handshake ........................ 4 + 2.2 Transaction Sequences .................................... 6 + 2.3 Protocol Correctness ..................................... 8 + 2.4 Truncating TIME-WAIT State ............................... 12 + 2.5 Transition to Standard TCP Operation ..................... 14 + 3. FUNCTIONAL SPECIFICATION ..................................... 17 + 3.1 Data Structures .......................................... 17 + 3.2 New TCP Options .......................................... 
17 + 3.3 Connection States ........................................ 19 + 3.4 T/TCP Processing Rules ................................... 25 + 3.5 User Interface ........................................... 28 + 4. IMPLEMENTATION ISSUES ........................................ 30 + 4.1 RFC-1323 Extensions ...................................... 30 + 4.2 Minimal Packet Sequence .................................. 31 + 4.3 RTT Measurement .......................................... 31 + 4.4 Cache Implementation ..................................... 32 + 4.5 CPU Performance .......................................... 32 + 4.6 Pre-SYN Queue ............................................ 33 + 6. ACKNOWLEDGMENTS .............................................. 34 + 7. REFERENCES ................................................... 34 + APPENDIX A. ALGORITHM SUMMARY ................................... 35 + + + +Braden [Page 1] + +RFC 1644 Transaction/TCP July 1994 + + + Security Considerations .......................................... 38 + Author's Address ................................................. 38 + +1. INTRODUCTION + + TCP was designed to around the virtual circuit model, to support + streaming of data. Another common mode of communication is a + client-server interaction, a request message followed by a response + message. The request/response paradigm is used by application-layer + protocols that implement transaction processing or remote procedure + calls, as well as by a number of network control and management + protocols (e.g., DNS and SNMP). Currently, many Internet user + programs that need request/response communication use UDP, and when + they require transport protocol functions such as reliable delivery + they must effectively build their own private transport protocol at + the application layer. + + Request/response, or "transaction-oriented", communication has the + following features: + + (a) The fundamental interaction is a request followed by a response. 
+ + (b) An explicit open or close phase may impose excessive overhead. + + (c) At-most-once semantics is required; that is, a transaction must + not be "replayed" as the result of a duplicate request packet. + + (d) The minimum transaction latency for a client should be RTT + + SPT, where RTT is the round-trip time and SPT is the server + processing time. + + (e) In favorable circumstances, a reliable request/response + handshake should be achievable with exactly one packet in each + direction. + + This memo concerns T/TCP, a backwards-compatible extension of TCP to + provide efficient transaction-oriented service in addition to + virtual-circuit service. T/TCP provides all the features listed + above, except for (e); the minimum exchange for T/TCP is three + segments. + + In this memo, we use the term "transaction" for an elementary + request/response packet sequence. This is not intended to imply any + of the semantics often associated with application-layer transaction + processing, like 3-phase commits. It is expected that T/TCP can be + used as the transport layer underlying such an application-layer + service, but the semantics of T/TCP is limited to transport-layer + services such as reliable, ordered delivery and at-most-once + + + +Braden [Page 2] + +RFC 1644 Transaction/TCP July 1994 + + + operation. + + An earlier memo [RFC-1379] presented the concepts involved in T/TCP. + However, the real-world usefulness of these ideas depends upon + practical issues like implementation complexity and performance. To + help explore these issues, this memo presents a functional + specification for a particular embodiment of the ideas presented in + RFC-1379. However, the specific algorithms in this memo represent a + later evolution than RFC-1379. In particular, Appendix A in RFC-1379 + explained the difficulties in truncating TIME-WAIT state.
However, + experience with an implementation of the RFC-1379 algorithms in a + workstation later showed that accumulation of TCB's in TIME-WAIT + state is an intolerable problem; this necessity led to a simple + solution for truncating TIME-WAIT state, described in this memo. + + Section 2 introduces the T/TCP extensions, and section 3 contains the + complete specification of T/TCP. Section 4 discusses some + implementation issues, and Appendix A contains an algorithmic + summary. This document assumes familiarity with the standard TCP + specification [STD-007]. + +2. OVERVIEW + + The TCP protocol is highly symmetric between the two ends of a + connection. This symmetry is not lost in T/TCP; for example, T/TCP + supports TCP's symmetric simultaneous open from both sides (Section + 2.3 below). However, transaction sequences use T/TCP in a highly + unsymmetrical manner. It is convenient to use the terms "client + host" and "server host" for the host that initiates a connection and + the host that responds, respectively. + + The goal of T/TCP is to allow each transaction, i.e., each + request/response sequence, to be efficiently performed as a single + incarnation of a TCP connection. Standard TCP imposes two + performance problems for transaction-oriented communication. First, + a TCP connection is opened with a "3-way handshake", which must + complete successfully before data can be transferred. The 3-way + handshake adds an extra RTT (round trip time) to the latency of a + transaction. + + The second performance problem is that closing a TCP connection + leaves one or both ends in TIME-WAIT state for a time 2*MSL, where + MSL is the maximum segment lifetime (defined to be 120 seconds). + TIME-WAIT state severely limits the rate of successive transactions + between the same (host,port) pair, since a new incarnation of the + connection cannot be opened until the TIME-WAIT delay expires. 
RFC- + 1379 explained why the alternative approach, using a different user + port for each transaction between a pair of hosts, also limits the + + + +Braden [Page 3] + +RFC 1644 Transaction/TCP July 1994 + + + transaction rate: (1) the 16-bit port space limits the rate to + 2**16/240 transactions per second, and (2) more practically, an + excessive amount of kernel space would be occupied by TCP state + blocks in TIME-WAIT state [RFC-1379]. + + T/TCP solves these two performance problems for transactions, by (1) + bypassing the 3-way handshake (3WHS) and (2) shortening the delay in + TIME-WAIT state. + + 2.1 Bypassing the Three-Way Handshake + + T/TCP introduces a 32-bit incarnation number, called a "connection + count" (CC), that is carried in a TCP option in each segment. A + distinct CC value is assigned to each direction of an open + connection. A T/TCP implementation assigns monotonically + increasing CC values to successive connections that it opens + actively or passively. + + T/TCP uses the monotonic property of CC values in initial <SYN> + segments to bypass the 3WHS, using a mechanism that we call TCP + Accelerated Open (TAO). Under the TAO mechanism, a host caches a + small amount of state per remote host. Specifically, a T/TCP host + that is acting as a server keeps a cache containing the last valid + CC value that it has received from each different client host. If + an initial <SYN> segment (i.e., a segment containing a SYN bit but + no ACK bit) from a particular client host carries a CC value + larger than the corresponding cached value, the monotonic property + of CC's ensures that the <SYN> segment must be new and can + therefore be accepted immediately. Otherwise, the server host + does not know whether the <SYN> segment is an old duplicate or was + simply delivered out of order; it therefore executes a normal 3WHS + to validate the <SYN>. Thus, the TAO mechanism provides an + optimization, with the normal TCP mechanism as a fallback.
+ + The CC value carried in non-<SYN> segments is used to protect + against old duplicate segments from earlier incarnations of the + same connection (we call such segments 'antique duplicates' for + short). In the case of short connections (e.g., transactions), + these CC values allow TIME-WAIT state delay to be safely truncated, as discussed + in Section 2.3. + + T/TCP defines three new TCP options, each of which carries one + 32-bit CC value. These options are named CC, CC.NEW, and CC.ECHO. + The CC option is normally used; CC.NEW and CC.ECHO have special + functions, as follows. + + + + + + +Braden [Page 4] + +RFC 1644 Transaction/TCP July 1994 + + + (a) CC.NEW + + Correctness of the TAO mechanism requires that clients + generate monotonically increasing CC values for successive + connection initiations. These values can be generated using + a simple global counter. There are certain circumstances + (discussed below in Section 2.2) when the client knows that + monotonicity may be violated; in this case, it sends a CC.NEW + rather than a CC option in the initial <SYN> segment. + Receiving a CC.NEW causes the server to invalidate its cache + entry and do a 3WHS. + + (b) CC.ECHO + + When a server host sends a <SYN,ACK> segment, it echoes the + connection count from the initial <SYN> in a CC.ECHO option, + which is used by the client host to validate the <SYN,ACK> + segment. + + Figure 1 illustrates the TAO mechanism bypassing a 3WHS. The + cached CC values, denoted by cache.CC[host], are shown on each + side. The server host compares the new CC value x in segment #1 + against x0, its cached value for client host A; this comparison is + called the "TAO test". Since x > x0, the <SYN> must be new and + can be accepted immediately; the data in the segment can therefore + be delivered to the user process B, and the cached value is + updated. If the TAO test failed (x <= x0), the server host would + do a normal three-way handshake to validate the <SYN> segment, but + the cache would not be updated.
+ + + + + + + + + + + + + + + + + + + + + + +Braden [Page 5] + +RFC 1644 Transaction/TCP July 1994 + + + + TCP A (Client) TCP B (Server) + _______________ ______________ + + cache.CC[A] + V + + [ x0 ] + + #1 --> --> (TAO test OK (x > x0) => + data1->user_B and + cache.CC[A]= x; ) + + [ x ] + #2 <-- <-- + (data2->user_A;) + + + Figure 1. TAO: Three-Way Handshake is Bypassed + + + The CC value x is echoed in a CC.ECHO option in the + segment (#2); the client side uses this option to validate the + segment. Since segment #2 is valid, its data2 is delivered to the + client user process. Segment #2 also carries B's CC value; this + is used by A to validate non-SYN segments from B, as explained in + Section 2.4. + + Implementing the T/TCP extensions expands the connection control + block (TCB) to include the two CC values for the connection; call + these variables TCB.CCsend and TCB.CCrecv (or CCsend, CCrecv for + short). For example, the sequence shown in Figure 1 sets + TCB.CCsend = x and TCB.CCrecv = y at host A, and vice versa at + host B. Any segment that is received with a CC option containing + a value SEG.CC different from TCB.CCsend will be rejected as an + antique duplicate. + + 2.2 Transaction Sequences + + T/TCP applies the TAO mechanism described in the previous section + to perform a transaction sequence. Figure 2 shows a minimal + transaction, when the request and response data can each fit into + a single segment. This requires three segments and completes in + one round-trip time (RTT). If the TAO test had failed on segment + #1, B would have queued data1 and the FIN for later processing, + and then it would have returned a segment to A, to + perform a normal 3WHS. 
+ + + + +Braden [Page 6] + +RFC 1644 Transaction/TCP July 1994 + + + + TCP A (Client) TCP B (Server) + _______________ ______________ + + CLOSED LISTEN + + #1 SYN-SENT* --> --> CLOSE-WAIT* + (TAO test OK) + (data1->user_B) + + <-- LAST-ACK* + #2 TIME-WAIT <-- + (data2->user_A) + + + #3 TIME-WAIT --> --> CLOSED + + (timeout) + CLOSED + + Figure 2: Minimal T/TCP Transaction Sequence + + + T/TCP extensions require additional connection states, e.g., the + SYN-SENT*, CLOSE-WAIT*, and LAST-ACK* states shown in Figure 2. + Section 3.3 describes these new connection states. + + To obtain the minimal 3-segment sequence shown in Figure 2, the + server host must delay acknowledging segment #1 so the response + may be piggy-backed on segment #2. If the application takes + longer than this delay to compute the response, the normal TCP + retransmission mechanism in TCP B will send an acknowledgment to + forestall a retransmission from TCP A. Figure 3 shows an example + of a slow server application. Although the sequence in Figure 3 + does contain a 3-way handshake, the TAO mechanism has allowed the + request data to be accepted immediately, so that the client still + sees the minimum latency. + + + + + + + + + + + + + + +Braden [Page 7] + +RFC 1644 Transaction/TCP July 1994 + + + + TCP A (Client) TCP B (Server) + _______________ ______________ + + CLOSED LISTEN + + #1 SYN-SENT* --> --> CLOSE-WAIT* + (TAO test OK => + data1->user_B) + + (timeout) + #2 FIN-WAIT-1 <-- <-- CLOSE-WAIT* + + + #3 FIN-WAIT-1 --> --> CLOSE-WAIT + + + #4 TIME-WAIT <-- <-- LAST-ACK + (data2->user_A) + + #5 TIME_WAIT --> --> CLOSED + + (timeout) + CLOSED + + Figure 3: Acknowledgment Timeout in Server + + + 2.3 Protocol Correctness + + This section fills in more details of the TAO mechanism and + provides an informal sketch of why the T/TCP protocol works. + + CC values are 32-bit integers. The TAO test requires the same + kind of modular arithmetic that is used to compare two TCP + sequence numbers. 
We assume that the boundary between y < z and z + < y for two CC values y and z occurs when they differ by 2**31, + i.e., by half the total CC space. + + The essential requirement for correctness of T/TCP is this: + + CC values must advance at a rate slower than 2**31 [R1] + counts per 2*MSL + + where MSL denotes the maximum segment lifetime in the Internet. + The requirement [R1] is easily met with a 32-bit CC. For example, + it will allow 10**6 transactions per second with the very liberal + MSL of 1000 seconds [RFC-1379]. This is well in excess of the + + + +Braden [Page 8] + +RFC 1644 Transaction/TCP July 1994 + + + transaction rates achievable with current operating systems and + network latency. + + Assume for the present that successive connections from client A + to server B contain only monotonically increasing CC values. That + is, if x(i) and x(i+1) are CC values carried in two successive + initial <SYN> segments from the same host, then x(i+1) > x(i). + Assuming the requirement [R1], the CC space cannot wrap within the + range of segments that can be outstanding at one time. Therefore, + those successive <SYN> segments from a given host that have not + exceeded their MSL must contain an ordered set of CC values: + + x(1) < x(2) < x(3) ... < x(n), + + where the modular comparisons have been replaced by simple + arithmetic comparisons. Here x(n) is the most recent acceptable + <SYN>, which is cached by the server. If the server host receives + a <SYN> segment containing a CC option with value y where y > + x(n), that <SYN> must be newer; an antique duplicate SYN with CC + value greater than x(n) must have exceeded its MSL and vanished. + Hence, monotonic CC values and the TAO test prevent erroneous + replay of antique <SYN>s. + + There are two possible reasons for a client to generate non- + monotonic CC values: (a) the client may have crashed and + restarted, causing the generated CC values to jump backwards; or + (b) the generated CC values may have wrapped around the finite + space.
Wraparound may occur because CC generation is global to + all connections. Suppose that host A sends a transaction to B, + then sends more than 2**31 transactions to other hosts, and + finally sends another transaction to B. From B's viewpoint, CC + will have jumped backward relative to its cached value. + + In either of these two cases, the server may see the CC value jump + backwards only after an interval of at least MSL since the last + <SYN> segment from the same client host. In case (a), client host + restart, this is because T/TCP retains TCP's explicit "Quiet Time" + of an MSL interval [STD-007]. In case (b), wraparound, [R1] + ensures that a time of at least MSL must have passed before the CC + space wraps around. Hence, there is no possibility that a TAO + test will succeed erroneously due to either cause of non- + monotonicity; i.e., there is no chance of replays due to TAO. + + However, although CC values jumping backwards will not cause an + error, it may cause a performance degradation due to unnecessary + 3WHS's. This results from the generated CC values jumping + backwards through approximately half their range, so that all + succeeding TAO tests fail until the generated CC values catch up + + + +Braden [Page 9] + +RFC 1644 Transaction/TCP July 1994 + + + to the cached value. To avoid this degradation, a client host + sends a CC.NEW option instead of a CC option in the case of either + system restart or CC wraparound. Receiving CC.NEW forces a 3WHS, + but when this 3WHS completes successfully the server cache is + updated to the new CC value. To detect CC wraparound, the client + must cache the last CC value it sent to each server. It therefore + maintains cache.CCsent[B] for each server B. If this cached value + is undefined or if it is larger than the next CC value generated + at the client, then the client sends a CC.NEW instead of a CC + option in the next SYN segment.
+ + This is illustrated in Figure 4, which shows the scenario for the + first transaction from A to B after the client host A has crashed + and recovered. A similar sequence occurs if x is not greater than + cache.CCsent[B], i.e., if there is a wraparound of the generated + CC values. Because segment #1 contains a CC.NEW option, the + server host invalidates the cache entry and does a 3WHS; however, + it still sets B's TCB.CCrecv for this connection to x. TCP B uses + this CCrecv value to validate the <ACK> segment (#3) that + completes the 3WHS. Receipt of this segment updates cache.CC[A], + since the cache entry was previously undefined. (If a 3WHS always + updated the cache, then out-of-order SYN segments could cause the + cached value to jump backwards, possibly allowing replays). + Finally, the CC.ECHO option in the <SYN,ACK> segment #2 defines + A's cache.CCsent entry. + + This algorithm delays updating cache.CCsent[] until the <SYN> has + been ACK'd. This allows the undefined cache.CCsent value to be used + as a "first-time switch" for reliable resynchronization of the + cached value at the server after a crash or wraparound. + + When we use the term "cache", we imply that the value can be + discarded at any time without introducing erroneous behavior + although it may degrade performance. + + (a) If a server host receives an initial <SYN> from client A but + has no cached value cache.CC[A], the server simply forces a + 3WHS to validate the <SYN> segment. + + (b) If a client host has no cached value cache.CCsent[B] when it + needs to send an initial <SYN> segment, the client simply + sends a CC.NEW option in the segment. This forces a 3WHS at + the server. + + + + + + + + +Braden [Page 10] + +RFC 1644 Transaction/TCP July 1994 + + + TCP A (Client) TCP B (Server) + _______________ ______________ + + cache.CCsent[B] cache.CC[A] + V V + + (Crash and restart) + [ ?? ] [ x0 ] + + #1 --> <SYN,data1,CC.NEW=x> --> (invalidate cache; + queue data1; + 3-way handshake) + + [ ?? ] [ ?? 
] + #2 <-- <SYN,ACK(SYN),CC=y,CC.ECHO=x> <-- + (cache.CCsent[B]= x;) + + [ x ] [ ?? ] + + #3 --> <ACK(SYN),CC=x> --> data1->user_B; + cache.CC[A]= x; + + [ x ] [ x ] + + Figure 4. Client Host Restarting + + + So far, we have considered only correctness of the TAO mechanism + for bypassing the 3WHS. We must also protect a connection against + antique duplicate non-SYN segments. In standard TCP, such + protection is one of the functions of the TIME-WAIT state delay. + (The other function is the TCP full-duplex close semantics, which + we need to preserve; that is discussed below in Section 2.5). In + order to achieve a high rate of transaction processing, it must be + possible to truncate this TIME-WAIT state delay without exposure + to antique duplicate segments [RFC-1379]. + + For short connections (e.g., transactions), the CC values assigned + to each direction of the connection can be used to protect against + antique duplicate non-SYN segments. Here we define "short" as a + duration less than MSL. Suppose that there is a connection that + uses the CC values TCB.CCsend = x and TCB.CCrecv = y. By the + requirement [R1], neither x nor y can be reused for a new + connection from the same remote host for a time at least 2*MSL. + If the connection has been in existence for a time less than MSL, + then its CC values will not be reused for a period that exceeds + MSL, and therefore all antique duplicates with that CC value must + vanish before it is reused. Thus, for "short" connections we can + + + +Braden [Page 11] + +RFC 1644 Transaction/TCP July 1994 + + + guard against antique non-SYN segments by simply checking the CC + value in the segment against TCB.CCrecv. Note that this check + does not use the monotonic property of the CC values, only that + they not cycle in less than 2*MSL. Again, the quiet time at + system restart protects against errors due to crash with loss of + state. + + If the connection duration exceeds MSL, safety from old duplicates + still requires a TIME-WAIT delay of 2*MSL.
Thus, truncation of + TIME-WAIT state is only possible for short connections. (This + problem has also been noticed by Shankar and Lee [ShankarLee93]). + This difference in behavior for long and for short connections + does create a slightly complex service model for applications + using T/TCP. An application has two different strategies for + multiple connections. For "short" connections, it should use a + fixed port pair and use the T/TCP mechanism to get rapid and + efficient transaction processing. For connections whose durations + are of the order of MSL or longer, it should use a different user + port for each successive connection, as is the current practice + with unmodified TCP. The latter strategy will cause excessive + overhead (due to TCB's in TIME-WAIT state) if it is applied to + high-frequency short connections. If an application makes the + wrong choice, its attempt to open a new connection may fail with a + "busy" error. If connection durations may range between long and + short, an application may have to be able to switch strategies + when one fails. + + 2.4 Truncating TIME-WAIT State + + Truncation of TIME-WAIT state is necessary to achieve high + transaction rates. As Figure 2 illustrates, a standard + transaction leaves the client end of the connection in TIME-WAIT + state. This section explains the protocol implications of + truncating TIME-WAIT state, when it is allowed (i.e., when the + connection has been in existence for less than MSL). In this + case, the client host should be able to interrupt TIME-WAIT state + to initiate a new incarnation of the same connection (i.e., using + the same host and ports). This will send an initial <SYN> + segment. + + It is possible for the new <SYN> to arrive at the server before + the retransmission state from the previous incarnation is gone, as + shown in Figure 5. Here the final <ACK> (segment #3) from the + previous incarnation is lost, leaving retransmission state at B.
+ However, the client received segment #2 and thinks the transaction + completed successfully, so it can initiate a new transaction by + sending segment #4. When this <SYN> arrives at the server + host, it must implicitly acknowledge segment #2, signalling + + + +Braden [Page 12] + +RFC 1644 Transaction/TCP July 1994 + + + success to the server application, deleting the old TCB, and + creating a new TCB, as shown in Figure 5. Still assuming that the + new <SYN> is known to be valid, the server host marks the new + connection half-synchronized and delivers data3 to the server + application. (The details of how this is accomplished are + presented in Section 3.3.) + + The earlier discussion of the TAO mechanism assumed that the + previous incarnation was closed before a new <SYN> arrived at the + server. However, TAO cannot be used to validate the <SYN> if + there is still state from the previous incarnation, as shown in + Figure 5; in this case, it would be exceedingly awkward to perform + a 3WHS if the TAO test should fail. Fortunately, a modified + version of the TAO test can still be performed, using the state in + the earlier TCB rather than the cached state. + + (A) If the segment contains a CC or CC.NEW option, the + value SEG.CC from this option is compared with TCB.CCrecv, + the CC value in the still-existing state block of the + previous incarnation. If SEG.CC > TCB.CCrecv, the new + segment must be valid. + + (B) Otherwise, the <SYN> is an old duplicate and is simply + discarded. + + Truncating TIME-WAIT state may be looked upon as composing an + extended state machine that joins the state machines of the two + incarnations, old and new. It may be described by introducing new + intermediate states (which we call I-states), with transitions + that join the two diagrams and share some state from each. I- + states are detailed in Section 3.3. + + Notice also segment #2' in Figure 5.
TCP's mechanism to recover + from half-open connections (see Figure 10 of [STD-007]) cause TCP + A to send a RST when 2' arrives, which would incorrectly make B + think that the previous transaction did not complete successfully. + The half-open recovery mechanism must be defeated in this case, by + A ignoring segment #2'. + + + + + + + + + + + + + +Braden [Page 13] + +RFC 1644 Transaction/TCP July 1994 + + + + TCP A (Client) TCP B (Server) + _______________ ______________ + + CLOSED LISTEN + + #1 --> <...,FIN,CC=x> --> LAST-ACK* + + #2 <-- <...ACK(FIN),data2,FIN,CC=y,CC.ECHO=x> <--- LAST-ACK* + TIME-WAIT + (data2->user_A) + + + #3 TIME-WAIT --> --> X (DROP) + + (New Active Open) (New Passive Open) + + #4 SYN-SENT* --> ... + + LISTEN-LA + #2' (discard) <-- <...ACK(FIN),data2,FIN,CC=y> <--- (retransmit) + + #4 SYN-SENT* ... --> ESTABLISHED* + SYN OK (see text) => + {Ack seg #2; + Delete old TCB; + Create new TCB; + data3 -> user_B; + cache.CC[A]= z;} + + Figure 5: Truncating TIME-WAIT State: SYN as Implicit ACK + + + 2.5 Transition to Standard TCP Operation + + T/TCP includes all normal TCP semantics, and it will continue to + operate exactly like TCP when the particular assumptions for + transactions do not hold. There is no limit on the size of an + individual transaction, and behavior of T/TCP should merge + seamlessly from pure transaction operation as shown in Figure 2, + to pure streaming mode for sending large files. All the sequences + shown in [STD-007] are still valid, and the inherent symmetry of + TCP is preserved. + + Figure 6 shows a possible sequence when the request and response + messages each require two segments. Segment #2 is a non-SYN + segment that contains a TCP option. To avoid compatibility + problems with existing TCP implementations, the client side should + + + +Braden [Page 14] + +RFC 1644 Transaction/TCP July 1994 + + + send segment #2 only if cache.CCsent[B] is defined, i.e., only if + host A knows that host B plays the new game. 
+ + + + TCP A (Client) TCP B (Server) + _______________ ______________ + + CLOSED LISTEN + + + #1 SYN-SENT* --> --> ESTABLISHED* + (TAO test OK => + data1-> user) + + #2 SYN-SENT* --> --> CLOSE-WAIT* + (data2-> user) + + CLOSE-WAIT* + #3 FIN-WAIT-2 <-- <-- + (data3->user) + + #4 TIME_WAIT <-- <-- LAST-ACK* + (data4->user) + + #5 TIME-WAIT --> --> CLOSED + + + Figure 6. Multi-Packet Request/Response Sequence + + Figure 7 shows a more complex example, one possible sequence with + TAO combined with simultaneous open and close. This may be + compared with Figure 8 of [STD-007]. + + + + + + + + + + + + + + + + + + +Braden [Page 15] + +RFC 1644 Transaction/TCP July 1994 + + + + TCP A TCP B + _______________ ______________ + + CLOSED CLOSED + + #1 SYN-SENT* --> ... + + #2 CLOSING* <-- <-- SYN-SENT* + (TAO test OK => + data2->user_A + + #3 CLOSING* --> ... + + #1' ... --> CLOSING* + (TAO test OK => + data1->user_B) + + #4 TIME-WAIT <-- <-- CLOSING* + + #5 TIME-WAIT --> ... + + #3' ... --> TIME-WAIT + + #6 TIME-WAIT <-- <--- TIME-WAIT + + #5' TIME-WAIT ... --> TIME-WAIT + + (timeout) (timeout) + CLOSED CLOSED + + Figure 7: Simultaneous Open and Close + + + + + + + + + + + + + + + + + + + +Braden [Page 16] + +RFC 1644 Transaction/TCP July 1994 + + +3. FUNCTIONAL SPECIFICATION + + 3.1 Data Structures + + A connection count is an unsigned 32-bit integer, with the value + zero excluded. Zero is used to denote an undefined value. + + A host maintains a global connection count variable CCgen, and + each connection control block (TCB) contains two new connection + count variables, TCB.CCsend and TCB.CCrecv. Whenever a TCB is + created for the active or passive end of a new connection, CCgen + is incremented by 1 and placed in TCB.CCsend of the TCB; however, + if the previous CCgen value was 0xffffffff (-1), then the next + value should be 1. TCB.CCrecv is initialized to zero (undefined). + + T/TCP adds a per-host cache to TCP. 
An entry in this cache for + foreign host fh includes two CC values, cache.CC[fh] and + cache.CCsent[fh]. It may include other values, as discussed in + Sections 4.3 and 4.4. According to [STD-007], a TCP is not + permitted to send a segment larger than the default size 536, + unless it has received a larger value in an MSS (Maximum Segment + Size) option. This could constrain the client to use the default + MSS of 536 bytes for every request. To avoid this constraint, a + T/TCP may cache the MSS option values received from remote hosts, + and we allow a TCP to use a cached MSS option value for the + initial SYN segment. + + When the client sends an initial segment containing data, it + does not have a send window for the server host. This is not a + great difficulty; we simply define a default initial window; our + current suggestion is 4K. Such a non-zero default should be + conditioned upon the existence of a cached connection count for + the foreign host, so that data may be included on an initial SYN + segment only if cache.CC[foreign host] is non-zero. + + In TCP, the window is dynamically adjusted to provide congestion + control/avoidance [Jacobson88]. It is possible that a particular + path might not be able to absorb an initial burst of 4096 bytes + without congestive losses. If this turns out to be a problem, it + should be possible to cache the congestion threshold for the path + and use this value to determine the maximum size of the initial + packet burst created by a request. + + 3.2 New TCP Options + + Three new TCP options are defined: CC, CC.NEW, and CC.ECHO. Each + carries a connection count SEG.CC. The complete rules for sending + and processing these options are given in Section 3.4 below.
+ + + +Braden [Page 17] + +RFC 1644 Transaction/TCP July 1994 + + + CC Option + + Kind: 11 + + Length: 6 + + +--------+--------+--------+--------+--------+--------+ + |00001011|00000110| Connection Count: SEG.CC | + +--------+--------+--------+--------+--------+--------+ + Kind=11 Length=6 + + This option may be sent in an initial SYN segment, and it may + be sent in other segments if a CC or CC.NEW option has been + received for this incarnation of the connection. Its SEG.CC + value is the TCB.CCsend value from the sender's TCB. + + CC.NEW Option + + Kind: 12 + + Length: 6 + + +--------+--------+--------+--------+--------+--------+ + |00001100|00000110| Connection Count: SEG.CC | + +--------+--------+--------+--------+--------+--------+ + Kind=12 Length=6 + + This option may be sent instead of a CC option in an initial + segment (i.e., SYN but not ACK bit), to indicate that the + SEG.CC value may not be larger than the previous value. Its + SEG.CC value is the TCB.CCsend value from the sender's TCB. + + CC.ECHO Option + + Kind: 13 + + Length: 6 + + +--------+--------+--------+--------+--------+--------+ + |00001101|00000110| Connection Count: SEG.CC | + +--------+--------+--------+--------+--------+--------+ + Kind=13 Length=6 + + This option must be sent (in addition to a CC option) in a + segment containing both a SYN and an ACK bit, if the initial + SYN segment contained a CC or CC.NEW option. Its SEG.CC value + is the SEG.CC value from the initial SYN. + + + + +Braden [Page 18] + +RFC 1644 Transaction/TCP July 1994 + + + A CC.ECHO option should be sent only in a segment and + should be ignored if it is received in any other segment. + + 3.3 Connection States + + T/TCP requires new connection states and state transitions. + Figure 8 shows the resulting finite state machine; see [RFC-1379] + for a detailed development. 
If all state names ending in stars + are removed from Figure 8, the state diagram reduces to the + standard TCP state machine (see Figure 6 of [STD-007]), with two + exceptions: + + * STD-007 shows a direct transition from SYN-RECEIVED to FIN- + WAIT-1 state when the user issues a CLOSE call. This + transition is suspect; a more accurate description of the + state machine would seem to require the intermediate SYN- + RECEIVED* state shown in Figure 8. + + * In STD-007, a user CLOSE call in SYN-SENT state causes a + direct transition to CLOSED state. The extended diagram of + Figure 8 forces the connection to open before it closes, + since calling CLOSE to terminate the request in SYN-SENT + state is normal behavior for a transaction client. In the + case that no data has been sent in SYN-SENT state, it is + reasonable for a user CLOSE call to immediately enter CLOSED + state and delete the TCB. + + Each of the new states in Figure 8 bears a starred name, created + by suffixing a star onto a standard TCP state. Each "starred" + state bears a simple relationship to the corresponding "unstarred" + state. + + o SYN-SENT* and SYN-RECEIVED* differ from the SYN-SENT and + SYN-RECEIVED state, respectively, in recording the fact that + a FIN needs to be sent. + + o The other starred states indicate that the connection is + half-synchronized (hence, a SYN bit needs to be sent). 
+ + + + + + + + + + + + + +Braden [Page 19] + +RFC 1644 Transaction/TCP July 1994 + + + ________ g ________ + | |<------------| | + | CLOSED |------------>| LISTEN | + |________| h ------|________| + | / | | + | / i| j| + | / | | + a| a'/ | _V______ ________ + | / j | |ESTAB- | e' | CLOSE- | + | / -----------|-->| LISHED*|------------>| WAIT*| + | / / | |________| |________| + | / / | | | | | + | / / | | c| d'| c| + ____V_V_ / _______V | __V_____ | __V_____ + | SYN- | b' | SYN- |c | |ESTAB- | e | | CLOSE- | + | SENT |------>|RECEIVED|---|->| LISHED|----------|->| WAIT | + |________| |________| | |________| | |________| + | | | | | | + | | | | __V_____ | + | | | | | LAST- | | + d'| d'| d'| d| | ACK* | | + | | | | |________| | + | | | | | | + | | ______V_ | ________ |c' |d + | k | | FIN- | | e''' | | | | + | -------|-->| WAIT-1*|---|------>|CLOSING*| | | + | / | |________| | |________| | | + | / | | | | | | + | / | c'| | c'| | | + ___V___ / ____V___ V_____V_ ____V___ V____V__ + | SYN- | b'' | SYN- | c | FIN- | e'' | | | LAST- | + | SENT* |---->|RECEIVD*|---->| WAIT-1 |---->|CLOSING | | ACK | + |________| |________| |________| |________| |________| + | | | + f| f| f'| + ___V____ ____V___ ___V____ + | FIN- | e |TIME- | T | | + | WAIT-2 |---->| WAIT |-->| CLOSED | + |________| |________| |________| + + + Figure 8A: Basic T/TCP State Diagram + + + + + + + + + +Braden [Page 20] + +RFC 1644 Transaction/TCP July 1994 + + + ________________________________________________________________ + | | + | Label Event / Action | + | _____ ________________________ | + | | + | a Active OPEN / create TCB, snd SYN | + | a' Active OPEN / snd SYN | + | b rcv SYN [no TAO]/ snd ACK(SYN) | + | b' rcv SYN [no TAO]/ snd SYN,ACK(SYN) | + | b'' rcv SYN [no TAO]/ snd SYN,FIN,ACK(SYN) | + | c rcv ACK(SYN) / | + | c' rcv ACK(SYN) / snd FIN | + | d CLOSE / snd FIN | + | d' CLOSE / snd SYN,FIN | + | e rcv FIN / snd ACK(FIN) | + | e' rcv FIN / snd SYN,ACK(FIN) | + | e'' rcv FIN / snd FIN,ACK(FIN) | + | 
e''' rcv FIN / snd SYN,FIN,ACK(FIN) | + | f rcv ACK(FIN) / | + | f' rcv ACK(FIN) / delete TCB | + | g CLOSE / delete TCB | + | h passive OPEN / create TCB | + | i (= b') rcv SYN [no TAO]/ snd SYN,ACK(SYN) | + | j rcv SYN [TAO OK] / snd SYN,ACK(SYN) | + | k rcv SYN [TAO OK] / snd SYN,FIN,ACK(SYN) | + | T timeout=2MSL / delete TCB | + | | + | | + | Figure 8B. Definition of State Transitions | + |________________________________________________________________| + + This simple correspondence leads to an alternative state model, + which makes it easy to incorporate the new states in an existing + implementation. Each state in the extended FSM is defined by the + triplet: + + (old_state, SENDSYN, SENDFIN) + + where 'old_state' is a standard TCP state and SENDFIN and SENDSYN + are Boolean flags see Figure 9. The SENDFIN flag is turned on (on + the client side) by a SEND(... EOF=YES) call, to indicate that a + FIN should be sent in a state which would not otherwise send a + FIN. The SENDSYN flag is turned on when the TAO test succeeds to + indicate that the connection is only half synchronized; as a + result, a SYN will be sent in a state which would not otherwise + send a SYN. + + + + + +Braden [Page 21] + +RFC 1644 Transaction/TCP July 1994 + + + ________________________________________________________________ + | | + | New state: Old_state: SENDSYN: SENDFIN: | + | __________ __________ ______ ______ | + | | + | SYN-SENT* => SYN-SENT FALSE TRUE | + | | + | SYN-RECEIVED* => SYN-RECEIVED FALSE TRUE | + | | + | ESTABLISHED* => ESTABLISHED TRUE FALSE | + | | + | CLOSE-WAIT* => CLOSE-WAIT TRUE FALSE | + | | + | LAST-ACK* => LAST-ACK TRUE FALSE | + | | + | FIN-WAIT-1* => FIN-WAIT-1 TRUE FALSE | + | | + | CLOSING* => CLOSING TRUE FALSE | + | | + | | + | Figure 9: Alternative State Definitions | + |________________________________________________________________| + + + Here is a more complete description of these boolean variables. 
+ + * SENDFIN + + SENDFIN is turned on by the SEND(...EOF=YES) call, and turned + off when FIN-WAIT-1 state is entered. It may only be on in + SYN-SENT* and SYN-RECEIVED* states. + + SENDFIN has two effects. First, it causes a FIN to be sent + on the last segment of data from the user. Second, it causes + the SYN-SENT[*] and SYN-RECEIVED[*] states to transition + directly to FIN-WAIT-1, skipping ESTABLISHED state. + + * SENDSYN + + The SENDSYN flag is turned on when an initial SYN segment is + received and passes the TAO test. SENDSYN is turned off when + the SYN is acknowledged (specifically, when there is no RST + or SYN bit and SEG.UNA < SND.ACK). + + SENDSYN has three effects. First, it causes the SYN bit to + be set in segments sent with the initial sequence number + (ISN). Second, it causes a transition directly from LISTEN + state to ESTABLISHED*, if there is no FIN bit, or otherwise + + + +Braden [Page 22] + +RFC 1644 Transaction/TCP July 1994 + + + to CLOSE-WAIT*. Finally, it allows data to be received and + processed (passed to the application) even if the segment + does not contain an ACK bit. + + According to the state model of the basic TCP specification [STD- + 007], the server side must explicitly issue a passive OPEN call, + creating a TCB in LISTEN state, before an initial SYN may be + accepted. To accommodate truncation of TIME-WAIT state within + this model, it is necessary to add the five "I-states" shown in + Figure 10. The I-states are: LISTEN-LA, LISTEN-LA*, LISTEN-CL, + LISTEN-CL*, and LISTEN-TW. These are 'bridge states' between the + state diagrams of two successive incarnations. + Here D is the duration of the previous connection, i.e., the + elapsed time since the connection opened. The transitions labeled + with lower-case letters are taken from Figure 8.
+ + Fortunately, many TCP implementations have a different user + interface model, in which the user can issue a generic passive open + ("listen") call; thereafter, when a matching initial SYN arrives, + a new TCB in LISTEN state is automatically generated. With this + user model, the I-states of Figure 10 are unnecessary. + + For example, suppose an initial SYN segment arrives for a + connection that is in LAST-ACK state. If this segment carries a + CC option and if SEG.CC is greater than TCB.CCrecv in the existing + TCB, the "Q" transition shown in Figure 10 can be made directly + from the LAST-ACK state. That is, the previous TCB is processed + as if an ACK(FIN) had arrived, causing the user to be notified of + a successful CLOSE and the TCB to be deleted. Then processing of + the new SYN segment is repeated, using a new TCB that is generated + automatically. The same principle can be used to avoid + implementing any of the I-states. + + + + + + + + + + + + + + + + + + + +Braden [Page 23] + +RFC 1644 Transaction/TCP July 1994 + + + ______________________________ +| P: Passive OPEN / | +| | +| Q: Rcv SYN, special TAO test | d'| d| +| (see text) / Delete TCB, | ________ ___V____ | +| create TCB, snd SYN | |LISTEN- | P | LAST- | | +| | | LA* |<-----| ACK* | | +| Q': (same as Q) if D < MSL | |________| |________| | +| | | | | | +| R: Rcv ACK(FIN) / Delete TCB,| Q| c'| c'| | +| create TCB | | | | | +| | | ___V____ V______V +| S': Active OPEN if D < MSL / | | |LISTEN- | P | LAST- | +| Delete TCB, create TCB, | | | LA |<-----| ACK | +| snd SYN.
| | |________| |________| +|______________________________| | | | | + | Q| R| f| + ________ ________ | | | | + e''' | | P |LISTEN- | | | V V + ---->|CLOSING*|----->| CL* | | | LISTEN CLOSED + |________| |________| | | + | | Q| | | + c'| c'| V V V + | | ESTABLISHED* + ____V___ V_______ + e'' | | P |LISTEN- | + ---->|CLOSING |------>| CL | + |________| |________| + | R| Q| + f| V V + | LISTEN ESTABLISHED* + ____V___ _________ + e |TIME- | P | LISTEN- | + ---->| WAIT |------------->| TW | + |________| |_________| + / | | | | + S'/ T| T| Q'| |S' + | _____V_ h _____V__ | V + | | |-------->| | | SYN-SENT + | | CLOSED |<--------| LISTEN | | + | |________| ------|________| | + | | / | j| | + | a| a'/ i| V V + | | / | ESTABLISHED* + V V V V + SYN-SENT ... + + Figure 10: I-States for TIME-WAIT Truncation + + + +Braden [Page 24] + +RFC 1644 Transaction/TCP July 1994 + + + 3.4 T/TCP Processing Rules + + This section summarizes the rules for sending and processing the + T/TCP options. + + INITIALIZATION + + I1: All cache entries cache.CC[*] and cache.CCsent[*] are + undefined (zero) when a host system initializes, and CCgen + is set to a non-zero value. + + I2: A new TCB is initialized with TCB.CCrecv = 0 and + TCB.CCsend = current CCgen value; CCgen is then + incremented. If the result is zero, CCgen is incremented + again. + + + SENDING SEGMENTS + + S1: Sending initial Segment + + An initial segment is sent with either a CC option + or a CC.NEW option. If cache.CCsent[fh] is undefined or + if TCB.CCsend < cache.CCsent[fh], then the option + CC.NEW(TCB.CCsend) is sent and cache.CCsent[fh] is set to + zero. Otherwise, the option CC(TCB.CCsend) is sent and + cache.CCsent[fh] is set to CCsend. + + S2: Sending Segment + + If the sender's TCB.CCrecv is non-zero, then a + segment is sent with both a CC(TCB.CCsend) option and a + CC.ECHO (TCB.CCrecv) option. 
+ + S3: Sending Non-SYN Segment + + A non-SYN segment is sent with a CC(TCB.CCsend) option if + the TCB.CCrecv value is non-zero, or if the state is SYN- + SENT or SYN-SENT* and cache.CCsent[fh] is non-zero (this + last is required to send CC options in the segments + following the first of a multi-segment request message; + see segment #2 in Figure 6). + + RECEIVING INITIAL SEGMENT + + Suppose that a server host receives a segment containing a SYN + bit but no ACK bit in LISTEN, SYN-SENT, or SYN-SENT* state. + + + + +Braden [Page 25] + +RFC 1644 Transaction/TCP July 1994 + + + R1.1:If the segment contains a CC or CC.NEW option, + SEG.CC is stored into TCB.CCrecv of the new TCB. + + R1.2:If the segment contains a CC option and if the local cache + entry cache.CC[fh] is defined and if + SEG.CC > cache.CC[fh], then the TAO test is passed and the + connection is half-synchronized in the incoming direction. + The server host replaces the cache.CC[fh] value by SEG.CC, + passes any data in the segment to the user, and processes + a FIN bit if present. + + Acknowledgment of the SYN is delayed to allow piggybacking + on a response segment. + + R1.3:If SEG.CC <= cache.CC[fh] (the TAO test has failed), or if + cache.CC[fh] is undefined, or if there is no CC option + (but possibly a CC.NEW option), the server host proceeds + with normal TCP processing. If the connection was in + LISTEN state, then the host executes a 3-way handshake + using the standard TCP rules. In the SYN-SENT or SYN- + SENT* state (i.e., the simultaneous open case), the TCP + sends ACK(SYN) and enters SYN-RECEIVED state. + + R1.4:If there is no CC option (but possibly a CC.NEW option), + then the server host sets cache.CC[fh] undefined (zero). + Receiving an ACK for a SYN (following application of rule + R1.3) will update cache.CC[fh], by rule R3. 
+ + Suppose that an initial segment containing a CC or CC.NEW + option arrives in an I-state (i.e., a state with a name of the + form 'LISTEN-xx', where xx is one of TW, LA, LA*, CL, or CL*): + + R1.5:If the state is LISTEN-TW, then the duration of the + current connection is compared with MSL. If duration > + MSL then send a RST: + + <SEQ=0><ACK=SEG.SEQ+SEG.LEN><CTL=RST,ACK> + + drop the packet, and return. + + R1.6:Perform a special TAO test: compare SEG.CC with + TCB.CCrecv. + + If SEG.CC is greater, then processing is performed as if + an ACK(FIN) had arrived: signal the application that the + previous close completed successfully and delete the + previous TCB. Then create a new TCB in LISTEN state and + reprocess the SYN segment against the new TCB. + + + +Braden [Page 26] + +RFC 1644 Transaction/TCP July 1994 + + + Otherwise, silently discard the segment. + + RECEIVING <SYN,ACK> SEGMENT + + Suppose that a client host receives a <SYN,ACK> segment for a + connection in SYN-SENT or SYN-SENT* state. + + R2.1:If SEG.ACK is not acceptable (see [STD-007]) and + cache.CCsent[fh] is non-zero, then simply drop the segment + without sending a RST. (The new SYN that the client is + (re-)transmitting will eventually acknowledge any + outstanding data and FIN at the server.) + + R2.2:If the segment contains a CC.ECHO option whose SEG.CC is + different from TCB.CCsend, then the segment is + unacceptable and is dropped. + + R2.3:If cache.CCsent[fh] is zero, then it is set to TCB.CCsend. + + R2.4:If the segment contains a CC option, its SEG.CC is stored + into TCB.CCrecv of the TCB. + + RECEIVING <ACK> SEGMENT IN SYN-RECEIVED STATE + + R3.1:If a segment contains a CC option whose SEG.CC differs + from TCB.CCrecv, then the segment is unacceptable and is + dropped. + + R3.2:Otherwise, a 3-way handshake has completed successfully at + the server side. If the segment contains a CC option and + if cache.CC[fh] is zero, then cache.CC[fh] is replaced by + TCB.CCrecv.
+ + RECEIVING OTHER SEGMENT + + R4: Any other segment received with a CC option is + unacceptable if SEG.CC differs from TCB.CCrecv. However, + a RST segment is exempted from this test. + + OPEN REQUEST + + To allow truncation of TIME-WAIT state, the following changes + are made in the state diagram for OPEN requests (see Figure + 10): + + O1.1:A new passive open request is allowed in any of the + states: LAST-ACK, LAST-ACK*, CLOSING, CLOSING*, or TIME- + WAIT. This causes a transition to the corresponding I- + + + +Braden [Page 27] + +RFC 1644 Transaction/TCP July 1994 + + + state (see Figure 10), which retains the previous state, + including the retransmission queue and timer. + + O1.2:A new active open request is allowed in TIME-WAIT or + LISTEN-TW state, if the elapsed time since the current + connection opened is less than MSL. The result is to + delete the old TCB and create a new one, send a new SYN + segment, and enter SYN-SENT or SYN-SENT* state (depending + upon whether or not the SYN segment contains a FIN bit). + + Finally, T/TCP has a provision to improve performance for the case + of a client that "sprays" transactions rapidly using many + different server hosts and/or ports. If TCB.CCrecv in the TCB is + non-zero (and still assuming that the connection duration is less + than MSL), then the TIME-WAIT delay may be set to min(K*RTO, + 2*MSL). Here RTO is the measured retransmission timeout time and + the constant K is currently specified to be 8. + + 3.5 User Interface + + STD-007 defines a prototype user interface ("transport service") + that implements the virtual circuit service model [STD-007, + Section 3.8]. One addition to this interface is required for + transaction processing: a new Boolean flag "end-of-file" (EOF), + added to the SEND call.
A generic SEND call becomes: + + Send + + Format: SEND (local connection name, buffer address, + byte count, PUSH flag, URGENT flag, EOF flag [,timeout]) + + The following text would be added to the description of SEND in + [STD-007]: + + If the EOF (End-Of-File) flag is set, any remaining queued + data is pushed and the connection is closed. Just as with the + CLOSE call, all data being sent is delivered reliably before + the close takes effect, and data may continue to be received + on the connection after completion of the SEND call. + + Figure 8A shows a skeleton sequence of user calls by which a + client could initiate a transaction. The SEND call initiates a + transaction request to the foreign socket (host and port) + specified in the passive OPEN call. The predicate "recv_EOF" + tests whether or not a FIN has been received on the connection; + this might be implemented using the STATUS command of [STD-007], + or it might be implemented by some operating-system-dependent + mechanism. When recv_EOF returns TRUE, the connection has been + + + +Braden [Page 28] + +RFC 1644 Transaction/TCP July 1994 + + + completely closed and the client end of the connection is in + TIME-WAIT state. + + __________________________________________________________________ + | | + | | + | OPEN(local_port, foreign_socket, PASSIVE) -> conn_name; | + | | + | SEND(conn_name, request_buffer, length, | + | PUSH=YES, URG=NO, EOF=YES); | + | | + | while (not recv_EOF(conn_name)) { | + | | + | RECEIVE(conn_name, reply_buffer, length) -> count; | + | | + | | + | } | + | | + | | + | Figure 8A: Client Side User Interface | + |__________________________________________________________________| + + If a client is going to send a rapid series of such requests to + the same foreign_socket, it should use the same local_port for + all. This will allow truncation of TIME-WAIT state. 
Otherwise, + it could leave local_port wild, allowing TCP to choose successive + local ports for each call, realizing that each transaction may + leave behind a significant control block overhead in the kernel. + + Figure 8B shows a basic sequence of server calls. The server + application waits for a request to arrive and then reads and + processes it until a FIN arrives (recv_EOF returns TRUE). At this + time, the connection is half-closed. The SEND call used to return + the reply completes the close in the other direction. It should + be noted that the use of SEND(... EOF=YES) in Figure 4B instead of + a SEND, CLOSE sequence is only an optimization; it allows + piggybacking the FIN in order to minimize the number of segments. + It should have little effect on transaction latency. + + + + + + + + + + + + + +Braden [Page 29] + +RFC 1644 Transaction/TCP July 1994 + + + __________________________________________________________________ + | | + | | + | OPEN(local_port, ANY_SOCKET, PASSIVE) -> conn_name; | + | | + | | + | | + | STATUS(conn_name) -> foreign_socket | + | | + | while (not recv_EOF(conn_name)) { | + | | + | RECEIVE(conn_name, request_buffer, length) -> count; | + | | + | | + | } | + | | + | | + | | + | SEND(conn_name, reply_buffer, length, | + | PUSH=YES, URG=NO, EOF=YES); | + | | + | | + | Figure 8B: Server Side User Interface | + |__________________________________________________________________| + + +4. IMPLEMENTATION ISSUES + + 4.1 RFC-1323 Extensions + + A recently-proposed set of TCP enhancements [RFC-1323] defines a + Timestamps option, which carries two 32-bit timestamp values. + This option is used to accurately measure round-trip time (RTT). + The same option is also used in a procedure known as "PAWS" + (Protect Against Wrapped Sequence) to prevent erroneous data + delivery due to a combination of old duplicate segments and + sequence number reuse at very high bandwidths. 
The approach to + transactions specified in this memo is independent of the RFC-1323 + enhancements, but implementation of RFC-1323 is desirable for all + TCP's. + + The RFC-1323 extensions share several common implementation issues + with the T/TCP extensions. Both require that TCP headers carry + options. Accommodating options in TCP headers requires changes in + the way that the maximum segment size is determined, to prevent + inadvertent IP fragmentation. Both require some additional state + variable in the TCB, which may or may not cause implementation + difficulties. + + + +Braden [Page 30] + +RFC 1644 Transaction/TCP July 1994 + + + 4.2 Minimal Packet Sequence + + Most TCP implementations will require some small modifications to + allow the minimal packet sequence for a transaction shown in + Figure 2. + + Many TCP implementations contain a mechanism to delay + acknowledgments of some subset of the data segments, to cut down + on the number of acknowledgment segments and to allow piggybacking + on the reverse data flow (typically character echoes). To obtain + minimal packet exchanges for transactions, it is necessary to + delay the acknowledgment of some control bits, in an analogous + manner. In particular, the <SYN,ACK> segment that is to be sent + in ESTABLISHED* or CLOSE-WAIT* state should be delayed. Note that + the amount of delay is determined by the minimum RTO at the + transmitter; it is a parameter of the communication protocol, + independent of the application. We propose to use the same delay + parameter (and if possible, the same mechanism) that is used for + delaying data acknowledgments. + + To get the FIN piggy-backed on the reply data (segment #3 in + Figure 2), those implementations that have an implied PUSH=YES on + all SEND calls will need to augment the user interface so that + PUSH=NO can be set for transactions. + + 4.3 RTT Measurement + + Transactions introduce new issues into the problem of measuring + round trip times [Jacobson88].
+ + (a) With the minimal 3-segment exchange, there can be exactly one + RTT measurement in each direction for each transaction. + Since dynamic estimation of RTT cannot take place within a + single transaction, it must take place across successive + transactions. Therefore, cacheing the measured RTT and RTT + variance values is essential for transaction processing; in + normal virtual circuit communication, such cacheing is only + desirable. + + (b) At the completion of a transaction, the values for RTT and + RTT variance that are retained in the cache must be some + average of previous values with the values measured during + the transaction that is completing. This raises the question + of the time constant for this average; quite different + dynamic considerations hold for transactions than for file + transfers, for example. + + (c) An RTT measurement by the client will yield the value: + + + +Braden [Page 31] + +RFC 1644 Transaction/TCP July 1994 + + + T = RTT + min(SPT, ATO), + + where SPT (server processing time) was defined in the + introduction, and ATO is the timeout period for sending a + delayed ACK. Thus, the measured RTT includes SPT, which may + be arbitrarily variable; however, the resulting variability + of the measured T cannot exceed ATO. (In a popular TCP + implementation, for example, ATO = 200ms, so that the + variance of SPT makes a relatively small contribution to the + variance of RTT.) + + (d) Transactions sample the RTT at random times, which are + determined by the client and the server applications rather + than by the network dynamics. When there are long pauses + between transactions, cached path properties will be poor + predictors of current values in the network. + + Thus, the dynamics of RTT measurement for transactions differ from + those for virtual circuits. RTT measurements should work + correctly for very short connections but reduce to the current TCP + algorithms for long-lasting connections. 
Further study of this + issue is needed. + + 4.4 Cache Implementation + + This extension requires a per-host cache of connection counts. + This cache may also contain values of the smoothed RTT, RTT + variance, congestion avoidance threshold, and MSS values. + Depending upon the implementation details, it may be simplest to + build a new cache for these values; another possibility is to use + the routing cache that should already be included in the host + [RFC-1122]. + + Implementation of the cache may be simplified because it is + consulted only when a connection is established; thereafter, the + CC values relevant to the connection are kept in the TCB. This + means that a cache entry may be safely reused during the lifetime + of a connection, avoiding the need for locking. + + 4.5 CPU Performance + + TCP implementations are customarily optimized for streaming of + data at high speeds, not for opening or closing connections. + Jacobson's Header Prediction algorithm [Jacobson90] handles the + simple common cases of in-sequence data and ACK segments when + streaming data. To provide good performance for transactions, an + implementation might be able to do an analogous "header + prediction" specifically for the minimal request and the response + + + +Braden [Page 32] + +RFC 1644 Transaction/TCP July 1994 + + + segments. + + The overhead of UDP provides a lower bound on the overhead of + TCP-based transaction processing. It will probably not be + possible to reach this bound for TCP transactions, since opening a + TCP connection involves creating a significant amount of state + that is not required by UDP. + + McKenney and Dove [McKenney92] have pointed out that transaction + processing applications of TCP can stress the performance of the + demultiplexing algorithm, i.e., the algorithm used to look up the + TCB when a segment arrives. They advocate the use of hash-table + techniques rather than a linear search.
The effect of + demultiplexing on performance may become especially acute for a + transaction client using the extended TCP described here, due to + TCB's left in TIME-WAIT state. A high rate of transactions from a + given client will leave a large number of TCB's in TIME-WAIT + state, until their timeout expires. If the TCP implementation + uses a linear search for demultiplexing, all of these control + blocks must be traversed in order to discover that the new + association does not exist. In this circumstance, performance of + a hash table lookup should not degrade severely due to + transactions. + + 4.6 Pre-SYN Queue + + Suppose that segment #1 in Figure 4 is lost in the network; when + segment #2 arrives in LISTEN state, it will be ignored by the TCP + rules (see [STD-007] p.66, "fourth other text and control"), and + must be retransmitted. It would be possible for the server side + to queue any ACK-less data segments received in LISTEN state and + to "replay" the segments in this queue when a SYN segment does + arrive. A data segment received with an ACK bit, which is the + normal case for existing TCP's, would still generate an RST + segment. + + Note that queueing segments in LISTEN state is different from + queueing out-of-order segments after the connection is + synchronized. In LISTEN state, the sequence number corresponding + to the left window edge is not yet known, so that the segment + cannot be trimmed to fit within the window before it is queued. + In fact, no processing should be done on a queued segment while + the connection is still in LISTEN state. Therefore, a new "pre- + SYN queue" would be needed. A timeout would be required, to flush + the Pre-SYN Queue in case a SYN segment was not received. + + Although implementation of a pre-SYN queue is not difficult in BSD + TCP, its limited contribution to throughput probably does not + + + +Braden [Page 33] + +RFC 1644 Transaction/TCP July 1994 + + + justify the effort. + +6.
ACKNOWLEDGMENTS + + I am very grateful to Dave Clark for pointing out bugs in RFC-1379 + and for helping me to clarify the model. I also wish to thank Greg + Minshall, whose probing questions led to further elucidation of the + issues in T/TCP. + +7. REFERENCES + + [Jacobson88] Jacobson, V., "Congestion Avoidance and Control", ACM + SIGCOMM '88, Stanford, CA, August 1988. + + [Jacobson90] Jacobson, V., "4BSD Header Prediction", Comp Comm + Review, v. 20, no. 2, April 1990. + + [McKenney92] McKenney, P., and K. Dove, "Efficient Demultiplexing + of Incoming TCP Packets", ACM SIGCOMM '92, Baltimore, MD, October + 1992. + + [RFC-1122] Braden, R., Ed., "Requirements for Internet Hosts -- + Communications Layers", STD-3, RFC-1122, USC/Information Sciences + Institute, October 1989. + + [RFC-1323] Jacobson, V., Braden, R., and D. Borman, "TCP Extensions + for High Performance", RFC-1323, LBL, USC/Information Sciences + Institute, Cray Research, February 1991. + + [RFC-1379] Braden, R., "Transaction TCP -- Concepts", RFC-1379, + USC/Information Sciences Institute, September 1992. + + [ShankarLee93] Shankar, A. and D. Lee, "Modulo-N Incarnation + Numbers for Cache-Based Transport Protocols", Report CS-TR-3046/ + UIMACS-TR-93-24, University of Maryland, March 1993. + + [STD-007] Postel, J., "Transmission Control Protocol - DARPA + Internet Program Protocol Specification", STD-007, RFC-793, + USC/Information Sciences Institute, September 1981. + + + + + + + + + + + + +Braden [Page 34] + +RFC 1644 Transaction/TCP July 1994 + + +APPENDIX A. ALGORITHM SUMMARY + + This appendix summarizes the additional processing rules introduced + by T/TCP. We define the following symbols: + + Options + + CC(SEG.CC): TCP Connection Count (CC) Option + CC.NEW(SEG.CC): TCP CC.NEW option + CC.ECHO(SEG.CC): TCP CC.ECHO option + + Here SEG.CC is the option value in the segment.
+ + Per-Connection State Variables in TCB + + CCsend: CC value to be sent in segments + CCrecv: CC value to be received in segments + Elapsed: Duration of connection + + Global Variables: + + CCgen: CC generator variable + cache.CC[fh]: Cache entry: Last CC value received. + cache.CCsent[fh]: Cache entry: Last CC value sent. + + + PSEUDO-CODE SUMMARY: + + Passive OPEN => { + Create new TCB; + } + + Active OPEN => { + + CCrecv = 0; + CCsend = CCgen; + If (CCgen == 0xffffffff) then Set CCgen = 1; + else Set CCgen = CCgen + 1. + + } + + + Send initial {SYN} segment => { + + If (cache.CCsent[fh] == 0 OR CCsend < cache.CCsent[fh] ) then { + + Include CC.NEW(CCsend) option in segment; + Set cache.CCsent[fh] = 0; + + + +Braden [Page 35] + +RFC 1644 Transaction/TCP July 1994 + + + } + else { + + Include CC(CCsend) option in segment; + Set cache.CCsent[fh] = CCsend; + } + } + + + Send {SYN,ACK} segment => { + + If (CCrecv != 0) then + Include CC(CCsend), CC.ECHO(CCrecv) options in segment. + } + + + Receive {SYN} segment in LISTEN, SYN-SENT, or SYN-SENT* state => { + + If state == LISTEN then { + CCrecv = 0; + CCsend = CCgen; + If (CCgen == 0xffffffff) then Set CCgen = 1; + else Set CCgen = CCgen + 1. + } + + If (Segment contains CC option OR + Segment contains CC.NEW option) then + Set CCrecv = SEG.CC. + + if (Segment contains CC option AND + cache.CC[fh] != 0 AND + SEG.CC > cache.CC[fh] ) then { /* TAO Test OK */ + + Set cache.CC[fh] = CCrecv; + + + } + + + If (Segment does not contain CC option) then + Set cache.CC[fh] = 0; + + . + } + + Receive {SYN} segment in LISTEN-TW, LISTEN-LA, LISTEN-LA*, LISTEN-CL, + or LISTEN-CL* state => { + + + + +Braden [Page 36] + +RFC 1644 Transaction/TCP July 1994 + + + If ( (Segment contains CC option AND CCrecv != 0 ) then { + + If (state = LISTEN-TW AND Elapsed > MSL ) then + . + + if (SEG.CC > CCrecv ) then { + ; + ; + . + /* Expect to match new TCB + * in LISTEN state. + */ + } + } + else + . 
+ } + + + Receive {SYN,ACK} segment => { + + if (Segment contains CC.ECHO option AND + SEG.CC != CCsend) then + . + + if (Segment contains CC option) then { + Set CCrecv = SEG.CC. + + if (cache.CC[fh] is undefined) then + Set cache.CC[fh] = CCrecv. + } + } + + + Send non-SYN segment => { + + if (CCrecv != 0 OR + (cache.CCsent[fh] != 0 AND + state is SYN-SENT or SYN-SENT*)) then + Include CC(CCsend) option in segment. + } + + + Receive non-SYN segment in SYN-RECEIVED state => { + + if (Segment contains CC option AND RST bit is off) { + if (SEG.CC != CCrecv) then + . + + if (cache.CC[fh] is undefined) then + Set cache.CC[fh] = CCrecv. + } + } + + + Receive non-SYN segment in (state >= ESTABLISHED) => { + + if (Segment contains CC option AND RST bit is off) { + if (SEG.CC != CCrecv) then + . + } + } + + +Security Considerations + + Security issues are not discussed in this memo. + +Author's Address + + Bob Braden + University of Southern California + Information Sciences Institute + 4676 Admiralty Way + Marina del Rey, CA 90292 + + Phone: (310) 822-1511 + EMail: Braden@ISI.EDU + + + + + + + + + + + + + + + + + + + +Braden [Page 38] + diff --git a/ext/picotcp/RFC/rfc1661.txt b/ext/picotcp/RFC/rfc1661.txt new file mode 100644 index 0000000..02112bd --- /dev/null +++ b/ext/picotcp/RFC/rfc1661.txt @@ -0,0 +1,2976 @@ + + + + + + +Network Working Group W. Simpson, Editor +Request for Comments: 1661 Daydreamer +STD: 51 July 1994 +Obsoletes: 1548 +Category: Standards Track + + + The Point-to-Point Protocol (PPP) + + + +Status of this Memo + + This document specifies an Internet standards track protocol for the + Internet community, and requests discussion and suggestions for + improvements. Please refer to the current edition of the "Internet + Official Protocol Standards" (STD 1) for the standardization state + and status of this protocol. Distribution of this memo is unlimited. 
+ + +Abstract + + The Point-to-Point Protocol (PPP) provides a standard method for + transporting multi-protocol datagrams over point-to-point links. PPP + is comprised of three main components: + + 1. A method for encapsulating multi-protocol datagrams. + + 2. A Link Control Protocol (LCP) for establishing, configuring, + and testing the data-link connection. + + 3. A family of Network Control Protocols (NCPs) for establishing + and configuring different network-layer protocols. + + This document defines the PPP organization and methodology, and the + PPP encapsulation, together with an extensible option negotiation + mechanism which is able to negotiate a rich assortment of + configuration parameters and provides additional management + functions. The PPP Link Control Protocol (LCP) is described in terms + of this mechanism. + + +Table of Contents + + + 1. Introduction .......................................... 1 + 1.1 Specification of Requirements ................... 2 + 1.2 Terminology ..................................... 3 + + 2. PPP Encapsulation ..................................... 4 + + +Simpson [Page i] + RFC 1661 Point-to-Point Protocol July 1994 + + + 3. PPP Link Operation .................................... 6 + 3.1 Overview ........................................ 6 + 3.2 Phase Diagram ................................... 6 + 3.3 Link Dead (physical-layer not ready) ............ 7 + 3.4 Link Establishment Phase ........................ 7 + 3.5 Authentication Phase ............................ 8 + 3.6 Network-Layer Protocol Phase .................... 8 + 3.7 Link Termination Phase .......................... 9 + + 4. The Option Negotiation Automaton ...................... 11 + 4.1 State Transition Table .......................... 12 + 4.2 States .......................................... 14 + 4.3 Events .......................................... 16 + 4.4 Actions ......................................... 
21 + 4.5 Loop Avoidance .................................. 23 + 4.6 Counters and Timers ............................. 24 + + 5. LCP Packet Formats .................................... 26 + 5.1 Configure-Request ............................... 28 + 5.2 Configure-Ack ................................... 29 + 5.3 Configure-Nak ................................... 30 + 5.4 Configure-Reject ................................ 31 + 5.5 Terminate-Request and Terminate-Ack ............. 33 + 5.6 Code-Reject ..................................... 34 + 5.7 Protocol-Reject ................................. 35 + 5.8 Echo-Request and Echo-Reply ..................... 36 + 5.9 Discard-Request ................................. 37 + + 6. LCP Configuration Options ............................. 39 + 6.1 Maximum-Receive-Unit (MRU) ...................... 41 + 6.2 Authentication-Protocol ......................... 42 + 6.3 Quality-Protocol ................................ 43 + 6.4 Magic-Number .................................... 45 + 6.5 Protocol-Field-Compression (PFC) ................ 48 + 6.6 Address-and-Control-Field-Compression (ACFC) + + SECURITY CONSIDERATIONS ...................................... 51 + REFERENCES ................................................... 51 + ACKNOWLEDGEMENTS ............................................. 51 + CHAIR'S ADDRESS .............................................. 52 + EDITOR'S ADDRESS ............................................. 52 + + + + + + + + + + +Simpson [Page ii] + RFC 1661 Point-to-Point Protocol July 1994 + + +1. Introduction + + The Point-to-Point Protocol is designed for simple links which + transport packets between two peers. These links provide full-duplex + simultaneous bi-directional operation, and are assumed to deliver + packets in order. It is intended that PPP provide a common solution + for easy connection of a wide variety of hosts, bridges and routers + [1]. 
+ + Encapsulation + + The PPP encapsulation provides for multiplexing of different + network-layer protocols simultaneously over the same link. The + PPP encapsulation has been carefully designed to retain + compatibility with most commonly used supporting hardware. + + Only 8 additional octets are necessary to form the encapsulation + when used within the default HDLC-like framing. In environments + where bandwidth is at a premium, the encapsulation and framing may + be shortened to 2 or 4 octets. + + To support high speed implementations, the default encapsulation + uses only simple fields, only one of which needs to be examined + for demultiplexing. The default header and information fields + fall on 32-bit boundaries, and the trailer may be padded to an + arbitrary boundary. + + Link Control Protocol + + In order to be sufficiently versatile to be portable to a wide + variety of environments, PPP provides a Link Control Protocol + (LCP). The LCP is used to automatically agree upon the + encapsulation format options, handle varying limits on sizes of + packets, detect a looped-back link and other common + misconfiguration errors, and terminate the link. Other optional + facilities provided are authentication of the identity of its peer + on the link, and determination when a link is functioning properly + and when it is failing. + + Network Control Protocols + + Point-to-Point links tend to exacerbate many problems with the + current family of network protocols. For instance, assignment and + management of IP addresses, which is a problem even in LAN + environments, is especially difficult over circuit-switched + point-to-point links (such as dial-up modem servers). These + problems are handled by a family of Network Control Protocols + (NCPs), which each manage the specific needs required by their + + + +Simpson [Page 1] + RFC 1661 Point-to-Point Protocol July 1994 + + + respective network-layer protocols. These NCPs are defined in + companion documents. 
+ + Configuration + + It is intended that PPP links be easy to configure. By design, + the standard defaults handle all common configurations. The + implementor can specify improvements to the default configuration, + which are automatically communicated to the peer without operator + intervention. Finally, the operator may explicitly configure + options for the link which enable the link to operate in + environments where it would otherwise be impossible. + + This self-configuration is implemented through an extensible + option negotiation mechanism, wherein each end of the link + describes to the other its capabilities and requirements. + Although the option negotiation mechanism described in this + document is specified in terms of the Link Control Protocol (LCP), + the same facilities are designed to be used by other control + protocols, especially the family of NCPs. + + + +1.1. Specification of Requirements + + In this document, several words are used to signify the requirements + of the specification. These words are often capitalized. + + MUST This word, or the adjective "required", means that the + definition is an absolute requirement of the specification. + + MUST NOT This phrase means that the definition is an absolute + prohibition of the specification. + + SHOULD This word, or the adjective "recommended", means that there + may exist valid reasons in particular circumstances to + ignore this item, but the full implications must be + understood and carefully weighed before choosing a + different course. + + MAY This word, or the adjective "optional", means that this + item is one of an allowed set of alternatives. An + implementation which does not include this option MUST be + prepared to interoperate with another implementation which + does include the option. + + + + + + +Simpson [Page 2] + RFC 1661 Point-to-Point Protocol July 1994 + + +1.2. 
Terminology + + This document frequently uses the following terms: + + datagram The unit of transmission in the network layer (such as IP). + A datagram may be encapsulated in one or more packets + passed to the data link layer. + + frame The unit of transmission at the data link layer. A frame + may include a header and/or a trailer, along with some + number of units of data. + + packet The basic unit of encapsulation, which is passed across the + interface between the network layer and the data link + layer. A packet is usually mapped to a frame; the + exceptions are when data link layer fragmentation is being + performed, or when multiple packets are incorporated into a + single frame. + + peer The other end of the point-to-point link. + + silently discard + The implementation discards the packet without further + processing. The implementation SHOULD provide the + capability of logging the error, including the contents of + the silently discarded packet, and SHOULD record the event + in a statistics counter. + + + + + + + + + + + + + + + + + + + + + + + + +Simpson [Page 3] + RFC 1661 Point-to-Point Protocol July 1994 + + +2. PPP Encapsulation + + The PPP encapsulation is used to disambiguate multiprotocol + datagrams. This encapsulation requires framing to indicate the + beginning and end of the encapsulation. Methods of providing framing + are specified in companion documents. + + A summary of the PPP encapsulation is shown below. The fields are + transmitted from left to right. + + +----------+-------------+---------+ + | Protocol | Information | Padding | + | 8/16 bits| * | * | + +----------+-------------+---------+ + + + Protocol Field + + The Protocol field is one or two octets, and its value identifies + the datagram encapsulated in the Information field of the packet. + The field is transmitted and received most significant octet + first. + + The structure of this field is consistent with the ISO 3309 + extension mechanism for address fields. 
All Protocols MUST be + odd; the least significant bit of the least significant octet MUST + equal "1". Also, all Protocols MUST be assigned such that the + least significant bit of the most significant octet equals "0". + Frames received which don't comply with these rules MUST be + treated as having an unrecognized Protocol. + + Protocol field values in the "0***" to "3***" range identify the + network-layer protocol of specific packets, and values in the + "8***" to "b***" range identify packets belonging to the + associated Network Control Protocols (NCPs), if any. + + Protocol field values in the "4***" to "7***" range are used for + protocols with low volume traffic which have no associated NCP. + Protocol field values in the "c***" to "f***" range identify + packets as link-layer Control Protocols (such as LCP). + + + + + + + + + + + +Simpson [Page 4] + RFC 1661 Point-to-Point Protocol July 1994 + + + Up-to-date values of the Protocol field are specified in the most + recent "Assigned Numbers" RFC [2]. This specification reserves + the following values: + + Value (in hex) Protocol Name + + 0001 Padding Protocol + 0003 to 001f reserved (transparency inefficient) + 007d reserved (Control Escape) + 00cf reserved (PPP NLPID) + 00ff reserved (compression inefficient) + + 8001 to 801f unused + 807d unused + 80cf unused + 80ff unused + + c021 Link Control Protocol + c023 Password Authentication Protocol + c025 Link Quality Report + c223 Challenge Handshake Authentication Protocol + + Developers of new protocols MUST obtain a number from the Internet + Assigned Numbers Authority (IANA), at IANA@isi.edu. + + + Information Field + + The Information field is zero or more octets. The Information + field contains the datagram for the protocol specified in the + Protocol field. + + The maximum length for the Information field, including Padding, + but not including the Protocol field, is termed the Maximum + Receive Unit (MRU), which defaults to 1500 octets. 
By + negotiation, consenting PPP implementations may use other values + for the MRU. + + + Padding + + On transmission, the Information field MAY be padded with an + arbitrary number of octets up to the MRU. It is the + responsibility of each protocol to distinguish padding octets from + real information. + + + + + + +Simpson [Page 5] + RFC 1661 Point-to-Point Protocol July 1994 + + +3. PPP Link Operation + +3.1. Overview + + In order to establish communications over a point-to-point link, each + end of the PPP link MUST first send LCP packets to configure and test + the data link. After the link has been established, the peer MAY be + authenticated. + + Then, PPP MUST send NCP packets to choose and configure one or more + network-layer protocols. Once each of the chosen network-layer + protocols has been configured, datagrams from each network-layer + protocol can be sent over the link. + + The link will remain configured for communications until explicit LCP + or NCP packets close the link down, or until some external event + occurs (an inactivity timer expires or network administrator + intervention). + + + +3.2. Phase Diagram + + In the process of configuring, maintaining and terminating the + point-to-point link, the PPP link goes through several distinct + phases which are specified in the following simplified state diagram: + + +------+ +-----------+ +--------------+ + | | UP | | OPENED | | SUCCESS/NONE + | Dead |------->| Establish |---------->| Authenticate |--+ + | | | | | | | + +------+ +-----------+ +--------------+ | + ^ | | | + | FAIL | FAIL | | + +<--------------+ +----------+ | + | | | + | +-----------+ | +---------+ | + | DOWN | | | CLOSING | | | + +------------| Terminate |<---+<----------| Network |<-+ + | | | | + +-----------+ +---------+ + + Not all transitions are specified in this diagram. The following + semantics MUST be followed. + + + + + + + +Simpson [Page 6] + RFC 1661 Point-to-Point Protocol July 1994 + + +3.3. 
Link Dead (physical-layer not ready) + + The link necessarily begins and ends with this phase. When an + external event (such as carrier detection or network administrator + configuration) indicates that the physical-layer is ready to be used, + PPP will proceed to the Link Establishment phase. + + During this phase, the LCP automaton (described later) will be in the + Initial or Starting states. The transition to the Link Establishment + phase will signal an Up event to the LCP automaton. + + Implementation Note: + + Typically, a link will return to this phase automatically after + the disconnection of a modem. In the case of a hard-wired link, + this phase may be extremely short -- merely long enough to detect + the presence of the device. + + + +3.4. Link Establishment Phase + + The Link Control Protocol (LCP) is used to establish the connection + through an exchange of Configure packets. This exchange is complete, + and the LCP Opened state entered, once a Configure-Ack packet + (described later) has been both sent and received. + + All Configuration Options are assumed to be at default values unless + altered by the configuration exchange. See the chapter on LCP + Configuration Options for further discussion. + + It is important to note that only Configuration Options which are + independent of particular network-layer protocols are configured by + LCP. Configuration of individual network-layer protocols is handled + by separate Network Control Protocols (NCPs) during the Network-Layer + Protocol phase. + + Any non-LCP packets received during this phase MUST be silently + discarded. + + The receipt of the LCP Configure-Request causes a return to the Link + Establishment phase from the Network-Layer Protocol phase or + Authentication phase. + + + + + + + + +Simpson [Page 7] + RFC 1661 Point-to-Point Protocol July 1994 + + +3.5. 
Authentication Phase + + On some links it may be desirable to require a peer to authenticate + itself before allowing network-layer protocol packets to be + exchanged. + + By default, authentication is not mandatory. If an implementation + desires that the peer authenticate with some specific authentication + protocol, then it MUST request the use of that authentication + protocol during Link Establishment phase. + + Authentication SHOULD take place as soon as possible after link + establishment. However, link quality determination MAY occur + concurrently. An implementation MUST NOT allow the exchange of link + quality determination packets to delay authentication indefinitely. + + Advancement from the Authentication phase to the Network-Layer + Protocol phase MUST NOT occur until authentication has completed. If + authentication fails, the authenticator SHOULD proceed instead to the + Link Termination phase. + + Only Link Control Protocol, authentication protocol, and link quality + monitoring packets are allowed during this phase. All other packets + received during this phase MUST be silently discarded. + + Implementation Notes: + + An implementation SHOULD NOT fail authentication simply due to + timeout or lack of response. The authentication SHOULD allow some + method of retransmission, and proceed to the Link Termination + phase only after a number of authentication attempts has been + exceeded. + + The implementation responsible for commencing Link Termination + phase is the implementation which has refused authentication to + its peer. + + + +3.6. Network-Layer Protocol Phase + + Once PPP has finished the previous phases, each network-layer + protocol (such as IP, IPX, or AppleTalk) MUST be separately + configured by the appropriate Network Control Protocol (NCP). + + Each NCP MAY be Opened and Closed at any time. 
+ + + + + +Simpson [Page 8] + RFC 1661 Point-to-Point Protocol July 1994 + + + Implementation Note: + + Because an implementation may initially use a significant amount + of time for link quality determination, implementations SHOULD + avoid fixed timeouts when waiting for their peers to configure a + NCP. + + After a NCP has reached the Opened state, PPP will carry the + corresponding network-layer protocol packets. Any supported + network-layer protocol packets received when the corresponding NCP is + not in the Opened state MUST be silently discarded. + + Implementation Note: + + While LCP is in the Opened state, any protocol packet which is + unsupported by the implementation MUST be returned in a Protocol- + Reject (described later). Only protocols which are supported are + silently discarded. + + During this phase, link traffic consists of any possible combination + of LCP, NCP, and network-layer protocol packets. + + + +3.7. Link Termination Phase + + PPP can terminate the link at any time. This might happen because of + the loss of carrier, authentication failure, link quality failure, + the expiration of an idle-period timer, or the administrative closing + of the link. + + LCP is used to close the link through an exchange of Terminate + packets. When the link is closing, PPP informs the network-layer + protocols so that they may take appropriate action. + + After the exchange of Terminate packets, the implementation SHOULD + signal the physical-layer to disconnect in order to enforce the + termination of the link, particularly in the case of an + authentication failure. The sender of the Terminate-Request SHOULD + disconnect after receiving a Terminate-Ack, or after the Restart + counter expires. The receiver of a Terminate-Request SHOULD wait for + the peer to disconnect, and MUST NOT disconnect until at least one + Restart time has passed after sending a Terminate-Ack. PPP SHOULD + proceed to the Link Dead phase. 
+ + Any non-LCP packets received during this phase MUST be silently + discarded. + + + + +Simpson [Page 9] + RFC 1661 Point-to-Point Protocol July 1994 + + + Implementation Note: + + The closing of the link by LCP is sufficient. There is no need + for each NCP to send a flurry of Terminate packets. Conversely, + the fact that one NCP has Closed is not sufficient reason to cause + the termination of the PPP link, even if that NCP was the only NCP + currently in the Opened state. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Simpson [Page 10] + RFC 1661 Point-to-Point Protocol July 1994 + + +4. The Option Negotiation Automaton + + The finite-state automaton is defined by events, actions and state + transitions. Events include reception of external commands such as + Open and Close, expiration of the Restart timer, and reception of + packets from a peer. Actions include the starting of the Restart + timer and transmission of packets to the peer. + + Some types of packets -- Configure-Naks and Configure-Rejects, or + Code-Rejects and Protocol-Rejects, or Echo-Requests, Echo-Replies and + Discard-Requests -- are not differentiated in the automaton + descriptions. As will be described later, these packets do indeed + serve different functions. However, they always cause the same + transitions. 
+ + Events Actions + + Up = lower layer is Up tlu = This-Layer-Up + Down = lower layer is Down tld = This-Layer-Down + Open = administrative Open tls = This-Layer-Started + Close= administrative Close tlf = This-Layer-Finished + + TO+ = Timeout with counter > 0 irc = Initialize-Restart-Count + TO- = Timeout with counter expired zrc = Zero-Restart-Count + + RCR+ = Receive-Configure-Request (Good) scr = Send-Configure-Request + RCR- = Receive-Configure-Request (Bad) + RCA = Receive-Configure-Ack sca = Send-Configure-Ack + RCN = Receive-Configure-Nak/Rej scn = Send-Configure-Nak/Rej + + RTR = Receive-Terminate-Request str = Send-Terminate-Request + RTA = Receive-Terminate-Ack sta = Send-Terminate-Ack + + RUC = Receive-Unknown-Code scj = Send-Code-Reject + RXJ+ = Receive-Code-Reject (permitted) + or Receive-Protocol-Reject + RXJ- = Receive-Code-Reject (catastrophic) + or Receive-Protocol-Reject + RXR = Receive-Echo-Request ser = Send-Echo-Reply + or Receive-Echo-Reply + or Receive-Discard-Request + + + + + + + + + + +Simpson [Page 11] + RFC 1661 Point-to-Point Protocol July 1994 + + +4.1. State Transition Table + + The complete state transition table follows. States are indicated + horizontally, and events are read vertically. State transitions and + actions are represented in the form action/new-state. Multiple + actions are separated by commas, and may continue on succeeding lines + as space requires; multiple actions may be implemented in any + convenient order. The state may be followed by a letter, which + indicates an explanatory footnote. The dash ('-') indicates an + illegal transition. 
+ + | State + | 0 1 2 3 4 5 +Events| Initial Starting Closed Stopped Closing Stopping +------+----------------------------------------------------------- + Up | 2 irc,scr/6 - - - - + Down | - - 0 tls/1 0 1 + Open | tls/1 1 irc,scr/6 3r 5r 5r + Close| 0 tlf/0 2 2 4 4 + | + TO+ | - - - - str/4 str/5 + TO- | - - - - tlf/2 tlf/3 + | + RCR+ | - - sta/2 irc,scr,sca/8 4 5 + RCR- | - - sta/2 irc,scr,scn/6 4 5 + RCA | - - sta/2 sta/3 4 5 + RCN | - - sta/2 sta/3 4 5 + | + RTR | - - sta/2 sta/3 sta/4 sta/5 + RTA | - - 2 3 tlf/2 tlf/3 + | + RUC | - - scj/2 scj/3 scj/4 scj/5 + RXJ+ | - - 2 3 4 5 + RXJ- | - - tlf/2 tlf/3 tlf/2 tlf/3 + | + RXR | - - 2 3 4 5 + + + + + + + + + + + + + + + +Simpson [Page 12] + RFC 1661 Point-to-Point Protocol July 1994 + + + + | State + | 6 7 8 9 +Events| Req-Sent Ack-Rcvd Ack-Sent Opened +------+----------------------------------------- + Up | - - - - + Down | 1 1 1 tld/1 + Open | 6 7 8 9r + Close|irc,str/4 irc,str/4 irc,str/4 tld,irc,str/4 + | + TO+ | scr/6 scr/6 scr/8 - + TO- | tlf/3p tlf/3p tlf/3p - + | + RCR+ | sca/8 sca,tlu/9 sca/8 tld,scr,sca/8 + RCR- | scn/6 scn/7 scn/6 tld,scr,scn/6 + RCA | irc/7 scr/6x irc,tlu/9 tld,scr/6x + RCN |irc,scr/6 scr/6x irc,scr/8 tld,scr/6x + | + RTR | sta/6 sta/6 sta/6 tld,zrc,sta/5 + RTA | 6 6 8 tld,scr/6 + | + RUC | scj/6 scj/7 scj/8 scj/9 + RXJ+ | 6 6 8 9 + RXJ- | tlf/3 tlf/3 tlf/3 tld,irc,str/5 + | + RXR | 6 7 8 ser/9 + + + The states in which the Restart timer is running are identifiable by + the presence of TO events. Only the Send-Configure-Request, Send- + Terminate-Request and Zero-Restart-Count actions start or re-start + the Restart timer. The Restart timer is stopped when transitioning + from any state where the timer is running to a state where the timer + is not running. + + The events and actions are defined according to a message passing + architecture, rather than a signalling architecture. 
If an action is + desired to control specific signals (such as DTR), additional actions + are likely to be required. + + [p] Passive option; see Stopped state discussion. + + [r] Restart option; see Open event discussion. + + [x] Crossed connection; see RCA event discussion. + + + + + + +Simpson [Page 13] + RFC 1661 Point-to-Point Protocol July 1994 + + +4.2. States + + Following is a more detailed description of each automaton state. + + Initial + + In the Initial state, the lower layer is unavailable (Down), and + no Open has occurred. The Restart timer is not running in the + Initial state. + + Starting + + The Starting state is the Open counterpart to the Initial state. + An administrative Open has been initiated, but the lower layer is + still unavailable (Down). The Restart timer is not running in the + Starting state. + + When the lower layer becomes available (Up), a Configure-Request + is sent. + + Closed + + In the Closed state, the link is available (Up), but no Open has + occurred. The Restart timer is not running in the Closed state. + + Upon reception of Configure-Request packets, a Terminate-Ack is + sent. Terminate-Acks are silently discarded to avoid creating a + loop. + + Stopped + + The Stopped state is the Open counterpart to the Closed state. It + is entered when the automaton is waiting for a Down event after + the This-Layer-Finished action, or after sending a Terminate-Ack. + The Restart timer is not running in the Stopped state. + + Upon reception of Configure-Request packets, an appropriate + response is sent. Upon reception of other packets, a Terminate- + Ack is sent. Terminate-Acks are silently discarded to avoid + creating a loop. + + Rationale: + + The Stopped state is a junction state for link termination, + link configuration failure, and other automaton failure modes. + These potentially separate states have been combined. 
+ + There is a race condition between the Down event response (from + + + +Simpson [Page 14] + RFC 1661 Point-to-Point Protocol July 1994 + + + the This-Layer-Finished action) and the Receive-Configure- + Request event. When a Configure-Request arrives before the + Down event, the Down event will supercede by returning the + automaton to the Starting state. This prevents attack by + repetition. + + Implementation Option: + + After the peer fails to respond to Configure-Requests, an + implementation MAY wait passively for the peer to send + Configure-Requests. In this case, the This-Layer-Finished + action is not used for the TO- event in states Req-Sent, Ack- + Rcvd and Ack-Sent. + + This option is useful for dedicated circuits, or circuits which + have no status signals available, but SHOULD NOT be used for + switched circuits. + + Closing + + In the Closing state, an attempt is made to terminate the + connection. A Terminate-Request has been sent and the Restart + timer is running, but a Terminate-Ack has not yet been received. + + Upon reception of a Terminate-Ack, the Closed state is entered. + Upon the expiration of the Restart timer, a new Terminate-Request + is transmitted, and the Restart timer is restarted. After the + Restart timer has expired Max-Terminate times, the Closed state is + entered. + + Stopping + + The Stopping state is the Open counterpart to the Closing state. + A Terminate-Request has been sent and the Restart timer is + running, but a Terminate-Ack has not yet been received. + + Rationale: + + The Stopping state provides a well defined opportunity to + terminate a link before allowing new traffic. After the link + has terminated, a new configuration may occur via the Stopped + or Starting states. + + Request-Sent + + In the Request-Sent state an attempt is made to configure the + connection. 
A Configure-Request has been sent and the Restart + timer is running, but a Configure-Ack has not yet been received + + + +Simpson [Page 15] + RFC 1661 Point-to-Point Protocol July 1994 + + + nor has one been sent. + + Ack-Received + + In the Ack-Received state, a Configure-Request has been sent and a + Configure-Ack has been received. The Restart timer is still + running, since a Configure-Ack has not yet been sent. + + Ack-Sent + + In the Ack-Sent state, a Configure-Request and a Configure-Ack + have both been sent, but a Configure-Ack has not yet been + received. The Restart timer is running, since a Configure-Ack has + not yet been received. + + Opened + + In the Opened state, a Configure-Ack has been both sent and + received. The Restart timer is not running. + + When entering the Opened state, the implementation SHOULD signal + the upper layers that it is now Up. Conversely, when leaving the + Opened state, the implementation SHOULD signal the upper layers + that it is now Down. + + + +4.3. Events + + Transitions and actions in the automaton are caused by events. + + Up + + This event occurs when a lower layer indicates that it is ready to + carry packets. + + Typically, this event is used by a modem handling or calling + process, or by some other coupling of the PPP link to the physical + media, to signal LCP that the link is entering Link Establishment + phase. + + It also can be used by LCP to signal each NCP that the link is + entering Network-Layer Protocol phase. That is, the This-Layer-Up + action from LCP triggers the Up event in the NCP. + + Down + + This event occurs when a lower layer indicates that it is no + + + +Simpson [Page 16] + RFC 1661 Point-to-Point Protocol July 1994 + + + longer ready to carry packets. + + Typically, this event is used by a modem handling or calling + process, or by some other coupling of the PPP link to the physical + media, to signal LCP that the link is entering Link Dead phase. 
+ + It also can be used by LCP to signal each NCP that the link is + leaving Network-Layer Protocol phase. That is, the This-Layer- + Down action from LCP triggers the Down event in the NCP. + + Open + + This event indicates that the link is administratively available + for traffic; that is, the network administrator (human or program) + has indicated that the link is allowed to be Opened. When this + event occurs, and the link is not in the Opened state, the + automaton attempts to send configuration packets to the peer. + + If the automaton is not able to begin configuration (the lower + layer is Down, or a previous Close event has not completed), the + establishment of the link is automatically delayed. + + When a Terminate-Request is received, or other events occur which + cause the link to become unavailable, the automaton will progress + to a state where the link is ready to re-open. No additional + administrative intervention is necessary. + + Implementation Option: + + Experience has shown that users will execute an additional Open + command when they want to renegotiate the link. This might + indicate that new values are to be negotiated. + + Since this is not the meaning of the Open event, it is + suggested that when an Open user command is executed in the + Opened, Closing, Stopping, or Stopped states, the + implementation issue a Down event, immediately followed by an + Up event. Care must be taken that an intervening Down event + cannot occur from another source. + + The Down followed by an Up will cause an orderly renegotiation + of the link, by progressing through the Starting to the + Request-Sent state. This will cause the renegotiation of the + link, without any harmful side effects. + + Close + + This event indicates that the link is not available for traffic; + + + +Simpson [Page 17] + RFC 1661 Point-to-Point Protocol July 1994 + + + that is, the network administrator (human or program) has + indicated that the link is not allowed to be Opened. 
When this + event occurs, and the link is not in the Closed state, the + automaton attempts to terminate the connection. Futher attempts + to re-configure the link are denied until a new Open event occurs. + + Implementation Note: + + When authentication fails, the link SHOULD be terminated, to + prevent attack by repetition and denial of service to other + users. Since the link is administratively available (by + definition), this can be accomplished by simulating a Close + event to the LCP, immediately followed by an Open event. Care + must be taken that an intervening Close event cannot occur from + another source. + + The Close followed by an Open will cause an orderly termination + of the link, by progressing through the Closing to the Stopping + state, and the This-Layer-Finished action can disconnect the + link. The automaton waits in the Stopped or Starting states + for the next connection attempt. + + Timeout (TO+,TO-) + + This event indicates the expiration of the Restart timer. The + Restart timer is used to time responses to Configure-Request and + Terminate-Request packets. + + The TO+ event indicates that the Restart counter continues to be + greater than zero, which triggers the corresponding Configure- + Request or Terminate-Request packet to be retransmitted. + + The TO- event indicates that the Restart counter is not greater + than zero, and no more packets need to be retransmitted. + + Receive-Configure-Request (RCR+,RCR-) + + This event occurs when a Configure-Request packet is received from + the peer. The Configure-Request packet indicates the desire to + open a connection and may specify Configuration Options. The + Configure-Request packet is more fully described in a later + section. + + The RCR+ event indicates that the Configure-Request was + acceptable, and triggers the transmission of a corresponding + Configure-Ack. 
+ + The RCR- event indicates that the Configure-Request was + + + +Simpson [Page 18] + RFC 1661 Point-to-Point Protocol July 1994 + + + unacceptable, and triggers the transmission of a corresponding + Configure-Nak or Configure-Reject. + + Implementation Note: + + These events may occur on a connection which is already in the + Opened state. The implementation MUST be prepared to + immediately renegotiate the Configuration Options. + + Receive-Configure-Ack (RCA) + + This event occurs when a valid Configure-Ack packet is received + from the peer. The Configure-Ack packet is a positive response to + a Configure-Request packet. An out of sequence or otherwise + invalid packet is silently discarded. + + Implementation Note: + + Since the correct packet has already been received before + reaching the Ack-Rcvd or Opened states, it is extremely + unlikely that another such packet will arrive. As specified, + all invalid Ack/Nak/Rej packets are silently discarded, and do + not affect the transitions of the automaton. + + However, it is not impossible that a correctly formed packet + will arrive through a coincidentally-timed cross-connection. + It is more likely to be the result of an implementation error. + At the very least, this occurance SHOULD be logged. + + Receive-Configure-Nak/Rej (RCN) + + This event occurs when a valid Configure-Nak or Configure-Reject + packet is received from the peer. The Configure-Nak and + Configure-Reject packets are negative responses to a Configure- + Request packet. An out of sequence or otherwise invalid packet is + silently discarded. + + Implementation Note: + + Although the Configure-Nak and Configure-Reject cause the same + state transition in the automaton, these packets have + significantly different effects on the Configuration Options + sent in the resulting Configure-Request packet. + + Receive-Terminate-Request (RTR) + + This event occurs when a Terminate-Request packet is received. 
+ The Terminate-Request packet indicates the desire of the peer to + + + +Simpson [Page 19] + RFC 1661 Point-to-Point Protocol July 1994 + + + close the connection. + + Implementation Note: + + This event is not identical to the Close event (see above), and + does not override the Open commands of the local network + administrator. The implementation MUST be prepared to receive + a new Configure-Request without network administrator + intervention. + + Receive-Terminate-Ack (RTA) + + This event occurs when a Terminate-Ack packet is received from the + peer. The Terminate-Ack packet is usually a response to a + Terminate-Request packet. The Terminate-Ack packet may also + indicate that the peer is in Closed or Stopped states, and serves + to re-synchronize the link configuration. + + Receive-Unknown-Code (RUC) + + This event occurs when an un-interpretable packet is received from + the peer. A Code-Reject packet is sent in response. + + Receive-Code-Reject, Receive-Protocol-Reject (RXJ+,RXJ-) + + This event occurs when a Code-Reject or a Protocol-Reject packet + is received from the peer. + + The RXJ+ event arises when the rejected value is acceptable, such + as a Code-Reject of an extended code, or a Protocol-Reject of a + NCP. These are within the scope of normal operation. The + implementation MUST stop sending the offending packet type. + + The RXJ- event arises when the rejected value is catastrophic, + such as a Code-Reject of Configure-Request, or a Protocol-Reject + of LCP! This event communicates an unrecoverable error that + terminates the connection. + + Receive-Echo-Request, Receive-Echo-Reply, Receive-Discard-Request + (RXR) + + This event occurs when an Echo-Request, Echo-Reply or Discard- + Request packet is received from the peer. The Echo-Reply packet + is a response to an Echo-Request packet. There is no reply to an + Echo-Reply or Discard-Request packet. + + + + + + +Simpson [Page 20] + RFC 1661 Point-to-Point Protocol July 1994 + + +4.4. 
Actions + + Actions in the automaton are caused by events and typically indicate + the transmission of packets and/or the starting or stopping of the + Restart timer. + + Illegal-Event (-) + + This indicates an event that cannot occur in a properly + implemented automaton. The implementation has an internal error, + which should be reported and logged. No transition is taken, and + the implementation SHOULD NOT reset or freeze. + + This-Layer-Up (tlu) + + This action indicates to the upper layers that the automaton is + entering the Opened state. + + Typically, this action is used by the LCP to signal the Up event + to a NCP, Authentication Protocol, or Link Quality Protocol, or + MAY be used by a NCP to indicate that the link is available for + its network layer traffic. + + This-Layer-Down (tld) + + This action indicates to the upper layers that the automaton is + leaving the Opened state. + + Typically, this action is used by the LCP to signal the Down event + to a NCP, Authentication Protocol, or Link Quality Protocol, or + MAY be used by a NCP to indicate that the link is no longer + available for its network layer traffic. + + This-Layer-Started (tls) + + This action indicates to the lower layers that the automaton is + entering the Starting state, and the lower layer is needed for the + link. The lower layer SHOULD respond with an Up event when the + lower layer is available. + + This results of this action are highly implementation dependent. + + This-Layer-Finished (tlf) + + This action indicates to the lower layers that the automaton is + entering the Initial, Closed or Stopped states, and the lower + layer is no longer needed for the link. The lower layer SHOULD + respond with a Down event when the lower layer has terminated. 
+ + + +Simpson [Page 21] + RFC 1661 Point-to-Point Protocol July 1994 + + + Typically, this action MAY be used by the LCP to advance to the + Link Dead phase, or MAY be used by a NCP to indicate to the LCP + that the link may terminate when there are no other NCPs open. + + This results of this action are highly implementation dependent. + + Initialize-Restart-Count (irc) + + This action sets the Restart counter to the appropriate value + (Max-Terminate or Max-Configure). The counter is decremented for + each transmission, including the first. + + Implementation Note: + + In addition to setting the Restart counter, the implementation + MUST set the timeout period to the initial value when Restart + timer backoff is used. + + Zero-Restart-Count (zrc) + + This action sets the Restart counter to zero. + + Implementation Note: + + This action enables the FSA to pause before proceeding to the + desired final state, allowing traffic to be processed by the + peer. In addition to zeroing the Restart counter, the + implementation MUST set the timeout period to an appropriate + value. + + Send-Configure-Request (scr) + + A Configure-Request packet is transmitted. This indicates the + desire to open a connection with a specified set of Configuration + Options. The Restart timer is started when the Configure-Request + packet is transmitted, to guard against packet loss. The Restart + counter is decremented each time a Configure-Request is sent. + + Send-Configure-Ack (sca) + + A Configure-Ack packet is transmitted. This acknowledges the + reception of a Configure-Request packet with an acceptable set of + Configuration Options. + + Send-Configure-Nak (scn) + + A Configure-Nak or Configure-Reject packet is transmitted, as + appropriate. This negative response reports the reception of a + + + +Simpson [Page 22] + RFC 1661 Point-to-Point Protocol July 1994 + + + Configure-Request packet with an unacceptable set of Configuration + Options. 
+ + Configure-Nak packets are used to refuse a Configuration Option + value, and to suggest a new, acceptable value. Configure-Reject + packets are used to refuse all negotiation about a Configuration + Option, typically because it is not recognized or implemented. + The use of Configure-Nak versus Configure-Reject is more fully + described in the chapter on LCP Packet Formats. + + Send-Terminate-Request (str) + + A Terminate-Request packet is transmitted. This indicates the + desire to close a connection. The Restart timer is started when + the Terminate-Request packet is transmitted, to guard against + packet loss. The Restart counter is decremented each time a + Terminate-Request is sent. + + Send-Terminate-Ack (sta) + + A Terminate-Ack packet is transmitted. This acknowledges the + reception of a Terminate-Request packet or otherwise serves to + synchronize the automatons. + + Send-Code-Reject (scj) + + A Code-Reject packet is transmitted. This indicates the reception + of an unknown type of packet. + + Send-Echo-Reply (ser) + + An Echo-Reply packet is transmitted. This acknowledges the + reception of an Echo-Request packet. + + + +4.5. Loop Avoidance + + The protocol makes a reasonable attempt at avoiding Configuration + Option negotiation loops. However, the protocol does NOT guarantee + that loops will not happen. As with any negotiation, it is possible + to configure two PPP implementations with conflicting policies that + will never converge. It is also possible to configure policies which + do converge, but which take significant time to do so. Implementors + should keep this in mind and SHOULD implement loop detection + mechanisms or higher level timeouts. + + + + + +Simpson [Page 23] + RFC 1661 Point-to-Point Protocol July 1994 + + +4.6. Counters and Timers + + Restart Timer + + There is one special timer used by the automaton. The Restart + timer is used to time transmissions of Configure-Request and + Terminate-Request packets. 
Expiration of the Restart timer causes + a Timeout event, and retransmission of the corresponding + Configure-Request or Terminate-Request packet. The Restart timer + MUST be configurable, but SHOULD default to three (3) seconds. + + Implementation Note: + + The Restart timer SHOULD be based on the speed of the link. + The default value is designed for low speed (2,400 to 9,600 + bps), high switching latency links (typical telephone lines). + Higher speed links, or links with low switching latency, SHOULD + have correspondingly faster retransmission times. + + Instead of a constant value, the Restart timer MAY begin at an + initial small value and increase to the configured final value. + Each successive value less than the final value SHOULD be at + least twice the previous value. The initial value SHOULD be + large enough to account for the size of the packets, twice the + round trip time for transmission at the link speed, and at + least an additional 100 milliseconds to allow the peer to + process the packets before responding. Some circuits add + another 200 milliseconds of satellite delay. Round trip times + for modems operating at 14,400 bps have been measured in the + range of 160 to more than 600 milliseconds. + + Max-Terminate + + There is one required restart counter for Terminate-Requests. + Max-Terminate indicates the number of Terminate-Request packets + sent without receiving a Terminate-Ack before assuming that the + peer is unable to respond. Max-Terminate MUST be configurable, + but SHOULD default to two (2) transmissions. + + Max-Configure + + A similar counter is recommended for Configure-Requests. Max- + Configure indicates the number of Configure-Request packets sent + without receiving a valid Configure-Ack, Configure-Nak or + Configure-Reject before assuming that the peer is unable to + respond. Max-Configure MUST be configurable, but SHOULD default + to ten (10) transmissions. 
+ + + + +Simpson [Page 24] + RFC 1661 Point-to-Point Protocol July 1994 + + + Max-Failure + + A related counter is recommended for Configure-Nak. Max-Failure + indicates the number of Configure-Nak packets sent without sending + a Configure-Ack before assuming that configuration is not + converging. Any further Configure-Nak packets for peer requested + options are converted to Configure-Reject packets, and locally + desired options are no longer appended. Max-Failure MUST be + configurable, but SHOULD default to five (5) transmissions. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Simpson [Page 25] + RFC 1661 Point-to-Point Protocol July 1994 + + +5. LCP Packet Formats + + There are three classes of LCP packets: + + 1. Link Configuration packets used to establish and configure a + link (Configure-Request, Configure-Ack, Configure-Nak and + Configure-Reject). + + 2. Link Termination packets used to terminate a link (Terminate- + Request and Terminate-Ack). + + 3. Link Maintenance packets used to manage and debug a link + (Code-Reject, Protocol-Reject, Echo-Request, Echo-Reply, and + Discard-Request). + + In the interest of simplicity, there is no version field in the LCP + packet. A correctly functioning LCP implementation will always + respond to unknown Protocols and Codes with an easily recognizable + LCP packet, thus providing a deterministic fallback mechanism for + implementations of other versions. + + Regardless of which Configuration Options are enabled, all LCP Link + Configuration, Link Termination, and Code-Reject packets (codes 1 + through 7) are always sent as if no Configuration Options were + negotiated. In particular, each Configuration Option specifies a + default value. This ensures that such LCP packets are always + recognizable, even when one end of the link mistakenly believes the + link to be open. 
+ + Exactly one LCP packet is encapsulated in the PPP Information field, + where the PPP Protocol field indicates type hex c021 (Link Control + Protocol). + + A summary of the Link Control Protocol packet format is shown below. + The fields are transmitted from left to right. + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Code | Identifier | Length | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Data ... + +-+-+-+-+ + + + Code + + The Code field is one octet, and identifies the kind of LCP + + + +Simpson [Page 26] + RFC 1661 Point-to-Point Protocol July 1994 + + + packet. When a packet is received with an unknown Code field, a + Code-Reject packet is transmitted. + + Up-to-date values of the LCP Code field are specified in the most + recent "Assigned Numbers" RFC [2]. This document concerns the + following values: + + 1 Configure-Request + 2 Configure-Ack + 3 Configure-Nak + 4 Configure-Reject + 5 Terminate-Request + 6 Terminate-Ack + 7 Code-Reject + 8 Protocol-Reject + 9 Echo-Request + 10 Echo-Reply + 11 Discard-Request + + + Identifier + + The Identifier field is one octet, and aids in matching requests + and replies. When a packet is received with an invalid Identifier + field, the packet is silently discarded without affecting the + automaton. + + Length + + The Length field is two octets, and indicates the length of the + LCP packet, including the Code, Identifier, Length and Data + fields. The Length MUST NOT exceed the MRU of the link. + + Octets outside the range of the Length field are treated as + padding and are ignored on reception. When a packet is received + with an invalid Length field, the packet is silently discarded + without affecting the automaton. + + Data + + The Data field is zero or more octets, as indicated by the Length + field. The format of the Data field is determined by the Code + field. 
+ + + + + + + + +Simpson [Page 27] + RFC 1661 Point-to-Point Protocol July 1994 + + +5.1. Configure-Request + + Description + + An implementation wishing to open a connection MUST transmit a + Configure-Request. The Options field is filled with any desired + changes to the link defaults. Configuration Options SHOULD NOT be + included with default values. + + Upon reception of a Configure-Request, an appropriate reply MUST + be transmitted. + + A summary of the Configure-Request packet format is shown below. The + fields are transmitted from left to right. + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Code | Identifier | Length | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Options ... + +-+-+-+-+ + + + Code + + 1 for Configure-Request. + + Identifier + + The Identifier field MUST be changed whenever the contents of the + Options field changes, and whenever a valid reply has been + received for a previous request. For retransmissions, the + Identifier MAY remain unchanged. + + Options + + The options field is variable in length, and contains the list of + zero or more Configuration Options that the sender desires to + negotiate. All Configuration Options are always negotiated + simultaneously. The format of Configuration Options is further + described in a later chapter. + + + + + + + + + +Simpson [Page 28] + RFC 1661 Point-to-Point Protocol July 1994 + + +5.2. Configure-Ack + + Description + + If every Configuration Option received in a Configure-Request is + recognizable and all values are acceptable, then the + implementation MUST transmit a Configure-Ack. The acknowledged + Configuration Options MUST NOT be reordered or modified in any + way. + + On reception of a Configure-Ack, the Identifier field MUST match + that of the last transmitted Configure-Request. 
Additionally, the + Configuration Options in a Configure-Ack MUST exactly match those + of the last transmitted Configure-Request. Invalid packets are + silently discarded. + + A summary of the Configure-Ack packet format is shown below. The + fields are transmitted from left to right. + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Code | Identifier | Length | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Options ... + +-+-+-+-+ + + + Code + + 2 for Configure-Ack. + + Identifier + + The Identifier field is a copy of the Identifier field of the + Configure-Request which caused this Configure-Ack. + + Options + + The Options field is variable in length, and contains the list of + zero or more Configuration Options that the sender is + acknowledging. All Configuration Options are always acknowledged + simultaneously. + + + + + + + + +Simpson [Page 29] + RFC 1661 Point-to-Point Protocol July 1994 + + +5.3. Configure-Nak + + Description + + If every instance of the received Configuration Options is + recognizable, but some values are not acceptable, then the + implementation MUST transmit a Configure-Nak. The Options field + is filled with only the unacceptable Configuration Options from + the Configure-Request. All acceptable Configuration Options are + filtered out of the Configure-Nak, but otherwise the Configuration + Options from the Configure-Request MUST NOT be reordered. + + Options which have no value fields (boolean options) MUST use the + Configure-Reject reply instead. + + Each Configuration Option which is allowed only a single instance + MUST be modified to a value acceptable to the Configure-Nak + sender. The default value MAY be used, when this differs from the + requested value. 
+ + When a particular type of Configuration Option can be listed more + than once with different values, the Configure-Nak MUST include a + list of all values for that option which are acceptable to the + Configure-Nak sender. This includes acceptable values that were + present in the Configure-Request. + + Finally, an implementation may be configured to request the + negotiation of a specific Configuration Option. If that option is + not listed, then that option MAY be appended to the list of Nak'd + Configuration Options, in order to prompt the peer to include that + option in its next Configure-Request packet. Any value fields for + the option MUST indicate values acceptable to the Configure-Nak + sender. + + On reception of a Configure-Nak, the Identifier field MUST match + that of the last transmitted Configure-Request. Invalid packets + are silently discarded. + + Reception of a valid Configure-Nak indicates that when a new + Configure-Request is sent, the Configuration Options MAY be + modified as specified in the Configure-Nak. When multiple + instances of a Configuration Option are present, the peer SHOULD + select a single value to include in its next Configure-Request + packet. + + Some Configuration Options have a variable length. Since the + Nak'd Option has been modified by the peer, the implementation + MUST be able to handle an Option length which is different from + + + +Simpson [Page 30] + RFC 1661 Point-to-Point Protocol July 1994 + + + the original Configure-Request. + + A summary of the Configure-Nak packet format is shown below. The + fields are transmitted from left to right. + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Code | Identifier | Length | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Options ... + +-+-+-+-+ + + + Code + + 3 for Configure-Nak. 
+ + Identifier + + The Identifier field is a copy of the Identifier field of the + Configure-Request which caused this Configure-Nak. + + Options + + The Options field is variable in length, and contains the list of + zero or more Configuration Options that the sender is Nak'ing. + All Configuration Options are always Nak'd simultaneously. + + + +5.4. Configure-Reject + + Description + + If some Configuration Options received in a Configure-Request are + not recognizable or are not acceptable for negotiation (as + configured by a network administrator), then the implementation + MUST transmit a Configure-Reject. The Options field is filled + with only the unacceptable Configuration Options from the + Configure-Request. All recognizable and negotiable Configuration + Options are filtered out of the Configure-Reject, but otherwise + the Configuration Options MUST NOT be reordered or modified in any + way. + + On reception of a Configure-Reject, the Identifier field MUST + match that of the last transmitted Configure-Request. + Additionally, the Configuration Options in a Configure-Reject MUST + + + +Simpson [Page 31] + RFC 1661 Point-to-Point Protocol July 1994 + + + be a proper subset of those in the last transmitted Configure- + Request. Invalid packets are silently discarded. + + Reception of a valid Configure-Reject indicates that when a new + Configure-Request is sent, it MUST NOT include any of the + Configuration Options listed in the Configure-Reject. + + A summary of the Configure-Reject packet format is shown below. The + fields are transmitted from left to right. + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Code | Identifier | Length | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Options ... + +-+-+-+-+ + + + Code + + 4 for Configure-Reject. 
+ + Identifier + + The Identifier field is a copy of the Identifier field of the + Configure-Request which caused this Configure-Reject. + + Options + + The Options field is variable in length, and contains the list of + zero or more Configuration Options that the sender is rejecting. + All Configuration Options are always rejected simultaneously. + + + + + + + + + + + + + + + + + + +Simpson [Page 32] + RFC 1661 Point-to-Point Protocol July 1994 + + +5.5. Terminate-Request and Terminate-Ack + + Description + + LCP includes Terminate-Request and Terminate-Ack Codes in order to + provide a mechanism for closing a connection. + + An implementation wishing to close a connection SHOULD transmit a + Terminate-Request. Terminate-Request packets SHOULD continue to + be sent until Terminate-Ack is received, the lower layer indicates + that it has gone down, or a sufficiently large number have been + transmitted such that the peer is down with reasonable certainty. + + Upon reception of a Terminate-Request, a Terminate-Ack MUST be + transmitted. + + Reception of an unelicited Terminate-Ack indicates that the peer + is in the Closed or Stopped states, or is otherwise in need of + re-negotiation. + + A summary of the Terminate-Request and Terminate-Ack packet formats + is shown below. The fields are transmitted from left to right. + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Code | Identifier | Length | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Data ... + +-+-+-+-+ + + + Code + + 5 for Terminate-Request; + + 6 for Terminate-Ack. + + Identifier + + On transmission, the Identifier field MUST be changed whenever the + content of the Data field changes, and whenever a valid reply has + been received for a previous request. For retransmissions, the + Identifier MAY remain unchanged. 
+ + On reception, the Identifier field of the Terminate-Request is + copied into the Identifier field of the Terminate-Ack packet. + + + + +Simpson [Page 33] + RFC 1661 Point-to-Point Protocol July 1994 + + + Data + + The Data field is zero or more octets, and contains uninterpreted + data for use by the sender. The data may consist of any binary + value. The end of the field is indicated by the Length. + + + +5.6. Code-Reject + + Description + + Reception of a LCP packet with an unknown Code indicates that the + peer is operating with a different version. This MUST be reported + back to the sender of the unknown Code by transmitting a Code- + Reject. + + Upon reception of the Code-Reject of a code which is fundamental + to this version of the protocol, the implementation SHOULD report + the problem and drop the connection, since it is unlikely that the + situation can be rectified automatically. + + A summary of the Code-Reject packet format is shown below. The + fields are transmitted from left to right. + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Code | Identifier | Length | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Rejected-Packet ... + +-+-+-+-+-+-+-+-+ + + + Code + + 7 for Code-Reject. + + Identifier + + The Identifier field MUST be changed for each Code-Reject sent. + + Rejected-Packet + + The Rejected-Packet field contains a copy of the LCP packet which + is being rejected. It begins with the Information field, and does + not include any Data Link Layer headers nor an FCS. The + Rejected-Packet MUST be truncated to comply with the peer's + + + +Simpson [Page 34] + RFC 1661 Point-to-Point Protocol July 1994 + + + established MRU. + + + +5.7. Protocol-Reject + + Description + + Reception of a PPP packet with an unknown Protocol field indicates + that the peer is attempting to use a protocol which is + unsupported. 
This usually occurs when the peer attempts to + configure a new protocol. If the LCP automaton is in the Opened + state, then this MUST be reported back to the peer by transmitting + a Protocol-Reject. + + Upon reception of a Protocol-Reject, the implementation MUST stop + sending packets of the indicated protocol at the earliest + opportunity. + + Protocol-Reject packets can only be sent in the LCP Opened state. + Protocol-Reject packets received in any state other than the LCP + Opened state SHOULD be silently discarded. + + A summary of the Protocol-Reject packet format is shown below. The + fields are transmitted from left to right. + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Code | Identifier | Length | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Rejected-Protocol | Rejected-Information ... + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + + Code + + 8 for Protocol-Reject. + + Identifier + + The Identifier field MUST be changed for each Protocol-Reject + sent. + + Rejected-Protocol + + The Rejected-Protocol field is two octets, and contains the PPP + Protocol field of the packet which is being rejected. + + + +Simpson [Page 35] + RFC 1661 Point-to-Point Protocol July 1994 + + + Rejected-Information + + The Rejected-Information field contains a copy of the packet which + is being rejected. It begins with the Information field, and does + not include any Data Link Layer headers nor an FCS. The + Rejected-Information MUST be truncated to comply with the peer's + established MRU. + + + +5.8. Echo-Request and Echo-Reply + + Description + + LCP includes Echo-Request and Echo-Reply Codes in order to provide + a Data Link Layer loopback mechanism for use in exercising both + directions of the link. 
This is useful as an aid in debugging, + link quality determination, performance testing, and for numerous + other functions. + + Upon reception of an Echo-Request in the LCP Opened state, an + Echo-Reply MUST be transmitted. + + Echo-Request and Echo-Reply packets MUST only be sent in the LCP + Opened state. Echo-Request and Echo-Reply packets received in any + state other than the LCP Opened state SHOULD be silently + discarded. + + + A summary of the Echo-Request and Echo-Reply packet formats is shown + below. The fields are transmitted from left to right. + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Code | Identifier | Length | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Magic-Number | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Data ... + +-+-+-+-+ + + + Code + + 9 for Echo-Request; + + 10 for Echo-Reply. + + + +Simpson [Page 36] + RFC 1661 Point-to-Point Protocol July 1994 + + + Identifier + + On transmission, the Identifier field MUST be changed whenever the + content of the Data field changes, and whenever a valid reply has + been received for a previous request. For retransmissions, the + Identifier MAY remain unchanged. + + On reception, the Identifier field of the Echo-Request is copied + into the Identifier field of the Echo-Reply packet. + + Magic-Number + + The Magic-Number field is four octets, and aids in detecting links + which are in the looped-back condition. Until the Magic-Number + Configuration Option has been successfully negotiated, the Magic- + Number MUST be transmitted as zero. See the Magic-Number + Configuration Option for further explanation. + + Data + + The Data field is zero or more octets, and contains uninterpreted + data for use by the sender. The data may consist of any binary + value. The end of the field is indicated by the Length. + + + +5.9. 
Discard-Request + + Description + + LCP includes a Discard-Request Code in order to provide a Data + Link Layer sink mechanism for use in exercising the local to + remote direction of the link. This is useful as an aid in + debugging, performance testing, and for numerous other functions. + + Discard-Request packets MUST only be sent in the LCP Opened state. + On reception, the receiver MUST silently discard any Discard- + Request that it receives. + + + + + + + + + + + + + +Simpson [Page 37] + RFC 1661 Point-to-Point Protocol July 1994 + + + A summary of the Discard-Request packet format is shown below. The + fields are transmitted from left to right. + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Code | Identifier | Length | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Magic-Number | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Data ... + +-+-+-+-+ + + Code + + 11 for Discard-Request. + + Identifier + + The Identifier field MUST be changed for each Discard-Request + sent. + + Magic-Number + + The Magic-Number field is four octets, and aids in detecting links + which are in the looped-back condition. Until the Magic-Number + Configuration Option has been successfully negotiated, the Magic- + Number MUST be transmitted as zero. See the Magic-Number + Configuration Option for further explanation. + + Data + + The Data field is zero or more octets, and contains uninterpreted + data for use by the sender. The data may consist of any binary + value. The end of the field is indicated by the Length. + + + + + + + + + + + + + + + + +Simpson [Page 38] + RFC 1661 Point-to-Point Protocol July 1994 + + +6. LCP Configuration Options + + LCP Configuration Options allow negotiation of modifications to the + default characteristics of a point-to-point link. 
If a Configuration + Option is not included in a Configure-Request packet, the default + value for that Configuration Option is assumed. + + Some Configuration Options MAY be listed more than once. The effect + of this is Configuration Option specific, and is specified by each + such Configuration Option description. (None of the Configuration + Options in this specification can be listed more than once.) + + The end of the list of Configuration Options is indicated by the + Length field of the LCP packet. + + Unless otherwise specified, all Configuration Options apply in a + half-duplex fashion; typically, in the receive direction of the link + from the point of view of the Configure-Request sender. + + Design Philosophy + + The options indicate additional capabilities or requirements of + the implementation that is requesting the option. An + implementation which does not understand any option SHOULD + interoperate with one which implements every option. + + A default is specified for each option which allows the link to + correctly function without negotiation of the option, although + perhaps with less than optimal performance. + + Except where explicitly specified, acknowledgement of an option + does not require the peer to take any additional action other than + the default. + + It is not necessary to send the default values for the options in + a Configure-Request. + + + A summary of the Configuration Option format is shown below. The + fields are transmitted from left to right. + + 0 1 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Type | Length | Data ... + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + + + + +Simpson [Page 39] + RFC 1661 Point-to-Point Protocol July 1994 + + + Type + + The Type field is one octet, and indicates the type of + Configuration Option. Up-to-date values of the LCP Option Type + field are specified in the most recent "Assigned Numbers" RFC [2]. 
+ This document concerns the following values: + + 0 RESERVED + 1 Maximum-Receive-Unit + 3 Authentication-Protocol + 4 Quality-Protocol + 5 Magic-Number + 7 Protocol-Field-Compression + 8 Address-and-Control-Field-Compression + + + Length + + The Length field is one octet, and indicates the length of this + Configuration Option including the Type, Length and Data fields. + + If a negotiable Configuration Option is received in a Configure- + Request, but with an invalid or unrecognized Length, a Configure- + Nak SHOULD be transmitted which includes the desired Configuration + Option with an appropriate Length and Data. + + Data + + The Data field is zero or more octets, and contains information + specific to the Configuration Option. The format and length of + the Data field is determined by the Type and Length fields. + + When the Data field is indicated by the Length to extend beyond + the end of the Information field, the entire packet is silently + discarded without affecting the automaton. + + + + + + + + + + + + + + + + +Simpson [Page 40] + RFC 1661 Point-to-Point Protocol July 1994 + + +6.1. Maximum-Receive-Unit (MRU) + + Description + + This Configuration Option may be sent to inform the peer that the + implementation can receive larger packets, or to request that the + peer send smaller packets. + + The default value is 1500 octets. If smaller packets are + requested, an implementation MUST still be able to receive the + full 1500 octet information field in case link synchronization is + lost. + + Implementation Note: + + This option is used to indicate an implementation capability. + The peer is not required to maximize the use of the capacity. + For example, when a MRU is indicated which is 2048 octets, the + peer is not required to send any packet with 2048 octets. The + peer need not Configure-Nak to indicate that it will only send + smaller packets, since the implementation will always require + support for at least 1500 octets. 
+ + A summary of the Maximum-Receive-Unit Configuration Option format is + shown below. The fields are transmitted from left to right. + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Type | Length | Maximum-Receive-Unit | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + + Type + + 1 + + Length + + 4 + + Maximum-Receive-Unit + + The Maximum-Receive-Unit field is two octets, and specifies the + maximum number of octets in the Information and Padding fields. + It does not include the framing, Protocol field, FCS, nor any + transparency bits or bytes. + + + + +Simpson [Page 41] + RFC 1661 Point-to-Point Protocol July 1994 + + +6.2. Authentication-Protocol + + Description + + On some links it may be desirable to require a peer to + authenticate itself before allowing network-layer protocol packets + to be exchanged. + + This Configuration Option provides a method to negotiate the use + of a specific protocol for authentication. By default, + authentication is not required. + + An implementation MUST NOT include multiple Authentication- + Protocol Configuration Options in its Configure-Request packets. + Instead, it SHOULD attempt to configure the most desirable + protocol first. If that protocol is Configure-Nak'd, then the + implementation SHOULD attempt the next most desirable protocol in + the next Configure-Request. + + The implementation sending the Configure-Request is indicating + that it expects authentication from its peer. If an + implementation sends a Configure-Ack, then it is agreeing to + authenticate with the specified protocol. An implementation + receiving a Configure-Ack SHOULD expect the peer to authenticate + with the acknowledged protocol. + + There is no requirement that authentication be full-duplex or that + the same protocol be used in both directions. 
It is perfectly + acceptable for different protocols to be used in each direction. + This will, of course, depend on the specific protocols negotiated. + + A summary of the Authentication-Protocol Configuration Option format + is shown below. The fields are transmitted from left to right. + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Type | Length | Authentication-Protocol | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Data ... + +-+-+-+-+ + + + Type + + 3 + + + + + +Simpson [Page 42] + RFC 1661 Point-to-Point Protocol July 1994 + + + Length + + >= 4 + + Authentication-Protocol + + The Authentication-Protocol field is two octets, and indicates the + authentication protocol desired. Values for this field are always + the same as the PPP Protocol field values for that same + authentication protocol. + + Up-to-date values of the Authentication-Protocol field are + specified in the most recent "Assigned Numbers" RFC [2]. Current + values are assigned as follows: + + Value (in hex) Protocol + + c023 Password Authentication Protocol + c223 Challenge Handshake Authentication Protocol + + + Data + + The Data field is zero or more octets, and contains additional + data as determined by the particular protocol. + + + +6.3. Quality-Protocol + + Description + + On some links it may be desirable to determine when, and how + often, the link is dropping data. This process is called link + quality monitoring. + + This Configuration Option provides a method to negotiate the use + of a specific protocol for link quality monitoring. By default, + link quality monitoring is disabled. + + The implementation sending the Configure-Request is indicating + that it expects to receive monitoring information from its peer. + If an implementation sends a Configure-Ack, then it is agreeing to + send the specified protocol. 
An implementation receiving a + Configure-Ack SHOULD expect the peer to send the acknowledged + protocol. + + There is no requirement that quality monitoring be full-duplex or + + + +Simpson [Page 43] + RFC 1661 Point-to-Point Protocol July 1994 + + + that the same protocol be used in both directions. It is + perfectly acceptable for different protocols to be used in each + direction. This will, of course, depend on the specific protocols + negotiated. + + A summary of the Quality-Protocol Configuration Option format is + shown below. The fields are transmitted from left to right. + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Type | Length | Quality-Protocol | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Data ... + +-+-+-+-+ + + + Type + + 4 + + Length + + >= 4 + + Quality-Protocol + + The Quality-Protocol field is two octets, and indicates the link + quality monitoring protocol desired. Values for this field are + always the same as the PPP Protocol field values for that same + monitoring protocol. + + Up-to-date values of the Quality-Protocol field are specified in + the most recent "Assigned Numbers" RFC [2]. Current values are + assigned as follows: + + Value (in hex) Protocol + + c025 Link Quality Report + + + Data + + The Data field is zero or more octets, and contains additional + data as determined by the particular protocol. + + + + + + +Simpson [Page 44] + RFC 1661 Point-to-Point Protocol July 1994 + + +6.4. Magic-Number + + Description + + This Configuration Option provides a method to detect looped-back + links and other Data Link Layer anomalies. This Configuration + Option MAY be required by some other Configuration Options such as + the Quality-Protocol Configuration Option. By default, the + Magic-Number is not negotiated, and zero is inserted where a + Magic-Number might otherwise be used. 
+ + Before this Configuration Option is requested, an implementation + MUST choose its Magic-Number. It is recommended that the Magic- + Number be chosen in the most random manner possible in order to + guarantee with very high probability that an implementation will + arrive at a unique number. A good way to choose a unique random + number is to start with a unique seed. Suggested sources of + uniqueness include machine serial numbers, other network hardware + addresses, time-of-day clocks, etc. Particularly good random + number seeds are precise measurements of the inter-arrival time of + physical events such as packet reception on other connected + networks, server response time, or the typing rate of a human + user. It is also suggested that as many sources as possible be + used simultaneously. + + When a Configure-Request is received with a Magic-Number + Configuration Option, the received Magic-Number is compared with + the Magic-Number of the last Configure-Request sent to the peer. + If the two Magic-Numbers are different, then the link is not + looped-back, and the Magic-Number SHOULD be acknowledged. If the + two Magic-Numbers are equal, then it is possible, but not certain, + that the link is looped-back and that this Configure-Request is + actually the one last sent. To determine this, a Configure-Nak + MUST be sent specifying a different Magic-Number value. A new + Configure-Request SHOULD NOT be sent to the peer until normal + processing would cause it to be sent (that is, until a Configure- + Nak is received or the Restart timer runs out). + + Reception of a Configure-Nak with a Magic-Number different from + that of the last Configure-Nak sent to the peer proves that a link + is not looped-back, and indicates a unique Magic-Number. If the + Magic-Number is equal to the one sent in the last Configure-Nak, + the possibility of a looped-back link is increased, and a new + Magic-Number MUST be chosen. 
In either case, a new Configure- + Request SHOULD be sent with the new Magic-Number. + + If the link is indeed looped-back, this sequence (transmit + Configure-Request, receive Configure-Request, transmit Configure- + + + +Simpson [Page 45] + RFC 1661 Point-to-Point Protocol July 1994 + + + Nak, receive Configure-Nak) will repeat over and over again. If + the link is not looped-back, this sequence might occur a few + times, but it is extremely unlikely to occur repeatedly. More + likely, the Magic-Numbers chosen at either end will quickly + diverge, terminating the sequence. The following table shows the + probability of collisions assuming that both ends of the link + select Magic-Numbers with a perfectly uniform distribution: + + Number of Collisions Probability + -------------------- --------------------- + 1 1/2**32 = 2.3 E-10 + 2 1/2**32**2 = 5.4 E-20 + 3 1/2**32**3 = 1.3 E-29 + + + Good sources of uniqueness or randomness are required for this + divergence to occur. If a good source of uniqueness cannot be + found, it is recommended that this Configuration Option not be + enabled; Configure-Requests with the option SHOULD NOT be + transmitted and any Magic-Number Configuration Options which the + peer sends SHOULD be either acknowledged or rejected. In this + case, looped-back links cannot be reliably detected by the + implementation, although they may still be detectable by the peer. + + If an implementation does transmit a Configure-Request with a + Magic-Number Configuration Option, then it MUST NOT respond with a + Configure-Reject when it receives a Configure-Request with a + Magic-Number Configuration Option. That is, if an implementation + desires to use Magic Numbers, then it MUST also allow its peer to + do so. If an implementation does receive a Configure-Reject in + response to a Configure-Request, it can only mean that the link is + not looped-back, and that its peer will not be using Magic- + Numbers. 
In this case, an implementation SHOULD act as if the + negotiation had been successful (as if it had instead received a + Configure-Ack). + + The Magic-Number also may be used to detect looped-back links + during normal operation, as well as during Configuration Option + negotiation. All LCP Echo-Request, Echo-Reply, and Discard- + Request packets have a Magic-Number field. If Magic-Number has + been successfully negotiated, an implementation MUST transmit + these packets with the Magic-Number field set to its negotiated + Magic-Number. + + The Magic-Number field of these packets SHOULD be inspected on + reception. All received Magic-Number fields MUST be equal to + either zero or the peer's unique Magic-Number, depending on + whether or not the peer negotiated a Magic-Number. + + + +Simpson [Page 46] + RFC 1661 Point-to-Point Protocol July 1994 + + + Reception of a Magic-Number field equal to the negotiated local + Magic-Number indicates a looped-back link. Reception of a Magic- + Number other than the negotiated local Magic-Number, the peer's + negotiated Magic-Number, or zero if the peer didn't negotiate one, + indicates a link which has been (mis)configured for communications + with a different peer. + + Procedures for recovery from either case are unspecified, and may + vary from implementation to implementation. A somewhat + pessimistic procedure is to assume a LCP Down event. A further + Open event will begin the process of re-establishing the link, + which can't complete until the looped-back condition is + terminated, and Magic-Numbers are successfully negotiated. A more + optimistic procedure (in the case of a looped-back link) is to + begin transmitting LCP Echo-Request packets until an appropriate + Echo-Reply is received, indicating a termination of the looped- + back condition. + + A summary of the Magic-Number Configuration Option format is shown + below. The fields are transmitted from left to right. 
+ + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Type | Length | Magic-Number + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + Magic-Number (cont) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + + Type + + 5 + + Length + + 6 + + Magic-Number + + The Magic-Number field is four octets, and indicates a number + which is very likely to be unique to one end of the link. A + Magic-Number of zero is illegal and MUST always be Nak'd, if it is + not Rejected outright. + + + + + + + +Simpson [Page 47] + RFC 1661 Point-to-Point Protocol July 1994 + + +6.5. Protocol-Field-Compression (PFC) + + Description + + This Configuration Option provides a method to negotiate the + compression of the PPP Protocol field. By default, all + implementations MUST transmit packets with two octet PPP Protocol + fields. + + PPP Protocol field numbers are chosen such that some values may be + compressed into a single octet form which is clearly + distinguishable from the two octet form. This Configuration + Option is sent to inform the peer that the implementation can + receive such single octet Protocol fields. + + As previously mentioned, the Protocol field uses an extension + mechanism consistent with the ISO 3309 extension mechanism for the + Address field; the Least Significant Bit (LSB) of each octet is + used to indicate extension of the Protocol field. A binary "0" as + the LSB indicates that the Protocol field continues with the + following octet. The presence of a binary "1" as the LSB marks + the last octet of the Protocol field. Notice that any number of + "0" octets may be prepended to the field, and will still indicate + the same value (consider the two binary representations for 3, + 00000011 and 00000000 00000011). + + When using low speed links, it is desirable to conserve bandwidth + by sending as little redundant data as possible. 
The Protocol- + Field-Compression Configuration Option allows a trade-off between + implementation simplicity and bandwidth efficiency. If + successfully negotiated, the ISO 3309 extension mechanism may be + used to compress the Protocol field to one octet instead of two. + The large majority of packets are compressible since data + protocols are typically assigned with Protocol field values less + than 256. + + Compressed Protocol fields MUST NOT be transmitted unless this + Configuration Option has been negotiated. When negotiated, PPP + implementations MUST accept PPP packets with either double-octet + or single-octet Protocol fields, and MUST NOT distinguish between + them. + + The Protocol field is never compressed when sending any LCP + packet. This rule guarantees unambiguous recognition of LCP + packets. + + When a Protocol field is compressed, the Data Link Layer FCS field + is calculated on the compressed frame, not the original + + + +Simpson [Page 48] + RFC 1661 Point-to-Point Protocol July 1994 + + + uncompressed frame. + + A summary of the Protocol-Field-Compression Configuration Option + format is shown below. The fields are transmitted from left to + right. + + 0 1 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Type | Length | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + + Type + + 7 + + Length + + 2 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Simpson [Page 49] + RFC 1661 Point-to-Point Protocol July 1994 + + +6.6. Address-and-Control-Field-Compression (ACFC) + + Description + + This Configuration Option provides a method to negotiate the + compression of the Data Link Layer Address and Control fields. By + default, all implementations MUST transmit frames with Address and + Control fields appropriate to the link framing. + + Since these fields usually have constant values for point-to-point + links, they are easily compressed. 
This Configuration Option is + sent to inform the peer that the implementation can receive + compressed Address and Control fields. + + If a compressed frame is received when Address-and-Control-Field- + Compression has not been negotiated, the implementation MAY + silently discard the frame. + + The Address and Control fields MUST NOT be compressed when sending + any LCP packet. This rule guarantees unambiguous recognition of + LCP packets. + + When the Address and Control fields are compressed, the Data Link + Layer FCS field is calculated on the compressed frame, not the + original uncompressed frame. + + A summary of the Address-and-Control-Field-Compression configuration + option format is shown below. The fields are transmitted from left + to right. + + 0 1 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Type | Length | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + + Type + + 8 + + Length + + 2 + + + + + + + +Simpson [Page 50] + RFC 1661 Point-to-Point Protocol July 1994 + + +Security Considerations + + Security issues are briefly discussed in sections concerning the + Authentication Phase, the Close event, and the Authentication- + Protocol Configuration Option. + + + +References + + [1] Perkins, D., "Requirements for an Internet Standard Point-to- + Point Protocol", RFC 1547, Carnegie Mellon University, + December 1993. + + [2] Reynolds, J., and Postel, J., "Assigned Numbers", STD 2, RFC + 1340, USC/Information Sciences Institute, July 1992. + + +Acknowledgements + + This document is the product of the Point-to-Point Protocol Working + Group of the Internet Engineering Task Force (IETF). Comments should + be submitted to the ietf-ppp@merit.edu mailing list. + + Much of the text in this document is taken from the working group + requirements [1]; and RFCs 1171 & 1172, by Drew Perkins while at + Carnegie Mellon University, and by Russ Hobby of the University of + California at Davis. 
+ + William Simpson was principally responsible for introducing + consistent terminology and philosophy, and the re-design of the phase + and negotiation state machines. + + Many people spent significant time helping to develop the Point-to- + Point Protocol. The complete list of people is too numerous to list, + but the following people deserve special thanks: Rick Adams, Ken + Adelman, Fred Baker, Mike Ballard, Craig Fox, Karl Fox, Phill Gross, + Kory Hamzeh, former WG chair Russ Hobby, David Kaufman, former WG + chair Steve Knowles, Mark Lewis, former WG chair Brian Lloyd, John + LoVerso, Bill Melohn, Mike Patton, former WG chair Drew Perkins, Greg + Satz, John Shriver, Vernon Schryver, and Asher Waldfogel. + + Special thanks to Morning Star Technologies for providing computing + resources and network access support for writing this specification. + + + + + + + +Simpson [Page 51] + RFC 1661 Point-to-Point Protocol July 1994 + + +Chair's Address + + The working group can be contacted via the current chair: + + Fred Baker + Advanced Computer Communications + 315 Bollay Drive + Santa Barbara, California 93117 + + fbaker@acc.com + + + +Editor's Address + + Questions about this memo can also be directed to: + + William Allen Simpson + Daydreamer + Computer Systems Consulting Services + 1384 Fontaine + Madison Heights, Michigan 48071 + + Bill.Simpson@um.cc.umich.edu + bsimpson@MorningStar.com + + + + + + + + + + + + + + + + + + + + + + + + + + +Simpson [Page 52] + + diff --git a/ext/picotcp/RFC/rfc1693.txt b/ext/picotcp/RFC/rfc1693.txt new file mode 100644 index 0000000..0ee3e3f --- /dev/null +++ b/ext/picotcp/RFC/rfc1693.txt @@ -0,0 +1,2019 @@ + + + + + + +Network Working Group T. Connolly +Request for Comments: 1693 P. Amer +Category: Experimental P. Conrad + University of Delaware + November 1994 + + + An Extension to TCP : Partial Order Service + +Status of This Memo + + This memo defines an Experimental Protocol for the Internet + community. 
This memo does not specify an Internet standard of any + kind. Discussion and suggestions for improvement are requested. + Distribution of this memo is unlimited + +IESG Note: + + Note that the work contained in this memo does not describe an + Internet standard. The Transport AD and Transport Directorate do not + recommend the implementation of the TCP modifications described. + However, outside the context of TCP, we find that the memo offers a + useful analysis of how misordered and incomplete data may be handled. + See, for example, the discussion of Application Layer Framing by D. + Clark and D. Tennenhouse in, "Architectural Considerations for a New + Generation of Protocols", SIGCOM 90 Proceedings, ACM, September 1990. + +Abstract + + This RFC introduces a new transport mechanism for TCP based upon + partial ordering. The aim is to present the concepts of partial + ordering and promote discussions on its usefulness in network + communications. Distribution of this memo is unlimited. + +Introduction + + A service which allows partial order delivery and partial reliability + is one which requires some, but not all objects to be received in the + order transmitted while also allowing objects to be transmitted + unreliably (i.e., some may be lost). + + The realization of such a service requires, (1) communication and/or + negotiation of what constitutes a valid ordering and/or loss-level, + and (2) an algorithm which enables the receiver to ascertain the + deliverability of objects as they arrive. These issues are addressed + here - both conceptually and formally - summarizing the results of + research and initial implementation efforts. 
+ + + + +Connolly, Amer & Conrad [Page 1] + +RFC 1693 An Extension to TCP: Partial Order Service November 1994 + + + The authors envision the use of a partial order service within a + connection-oriented, transport protocol such as TCP providing a + further level of granularity to the transport user in terms of the + type and quality of offered service. This RFC focuses specifically + on extending TCP to provide partial order connections. + + The idea of a partial order service is not limited to TCP. It may be + considered a useful option for any transport protocol and we + encourage researchers and practitioners to investigate further the + most effective uses for partial ordering whether in a next-generation + TCP, or another general purpose protocol such as XTP, or perhaps + within a "special purpose" protocol tailored to a specific + application and network profile. + + Finally, while the crux of this RFC relates to and introduces a new + way of considering object ordering, a number of other classic + transport mechanisms are also seen in a new light - among these are + reliability, window management and data acknowledgments. + + Keywords: partial order, quality of service, reliability, multimedia, + client/server database, Windows, transport protocol + +Table of Contents + + 1. Introduction and motivation .................................. 3 + 2. Partial Order Delivery ....................................... 4 + 2.1 Example 1: Remote Database .................................. 4 + 2.2 Example 2: Multimedia ....................................... 8 + 2.3 Example 3: Windows Screen Refresh ........................... 9 + 2.4 Potential Savings ........................................... 10 + 3. Reliability vs. Order ........................................ 12 + 3.1 Reliability Classes ......................................... 13 + 4. Partial Order Connection ..................................... 
 15 + 4.1 Connection Establishment .................................... 16 + 4.2 Data Transmission ........................................... 19 + 4.2.1 Sender .................................................... 22 + 4.2.2 Receiver .................................................. 25 + 5. Quantifying and Comparing Partial Order Services ............. 30 + 6. Future Direction ............................................. 31 + 7. Summary ...................................................... 32 + 8. References ................................................... 34 + Security Considerations ......................................... 35 + Authors' Addresses .............................................. 36 + + + + + + + + +Connolly, Amer & Conrad [Page 2] + +RFC 1693 An Extension to TCP: Partial Order Service November 1994 + + +1. Introduction and motivation + + Current applications that need to communicate objects (i.e., octets, + packets, frames, protocol data units) usually choose between a fully + ordered service such as that currently provided by TCP and one that + does not guarantee any ordering such as that provided by UDP. A + similar "all-or-nothing" choice is made for object reliability - + reliable connections which guarantee all objects will be delivered + versus unreliable data transport which makes no guarantee. What is + more appropriate for some applications is a partial order and/or + partial reliability service where a subset of objects being + communicated must arrive in the order transmitted, yet some objects + may arrive in a different order, and some (well specified subset) of + the objects may not arrive at all. + + One motivating application for a partial order service is the + emerging area of multimedia communications.
 Multimedia traffic is + often characterized either by periodic, synchronized parallel streams + of information (e.g., combined audio-video), or by structured image + streams (e.g., displays of multiple overlapping and nonoverlapping + windows). These applications have a high degree of tolerance for + less-than-fully-ordered data transport as well as data loss. Thus + they are ideal candidates for using a partial order, partial + reliability service. In general, any application which communicates + parallel and/or independent data structures may potentially be able + to profit from a partial order service. + + A second application that could benefit from a partial order service + involves remote or distributed databases. Imagine the case where a + database user transmitting queries to a remote server expects objects + (or records) to be returned in some order, although not necessarily + total order. For example a user writing an SQL data query might + specify this with the "order by" clause. There exist today a great + number of commercial implementations of distributed databases which + utilize - and thus are penalized by - an ordered delivery service. + + Currently these applications must use and pay for a fully + ordered/fully reliable service even though they do not need it. The + introduction of partial services allows applications to lower the + demanded quality of service (QOS) of the communication assuming that + such a service is more efficient and less costly. In effect, a + partial order extends the service level from two extremes - ordered + and unordered - to a range of discrete values encompassing both of + the extremes and all possible partial orderings in between. A + similar phenomenon is demonstrated in the area of reliability.
+ + + + + + +Connolly, Amer & Conrad [Page 3] + +RFC 1693 An Extension to TCP: Partial Order Service November 1994 + + + It is worth mentioning that a TCP implementation providing a partial + order service, as described here, would be able to communicate with a + non-partial order implementation simply by recognizing this fact at + connection establishment - hence this extension is backward + compatible with earlier versions of TCP. Furthermore, it is + conceivable for a host to support the sending-half (or receiving- + half) of a partial order connection alone to reduce the size of the + TCP as well as the effort involved in the implementation. Similar + "levels of conformance" have been proposed in other internet + extensions such as [Dee89] involving IP multicasting. + + This RFC proceeds as follows. The principles of partial order + delivery, published in [ACCD93a], are presented in Section 2. The + notion of partial reliability, published in [ACCD93b], is introduced + in Section 3 followed by an explanation of "reliability classes". + Then, the practical issues involved with setting up and maintaining a + Partial Order Connection (POC) within a TCP framework are addressed + in Section 4 looking first at connection establishment, and then + discussing the sender's role and the receiver's role. Section 5 + provides insights into the expected performance improvements of a + partial order service over an ordered service and Section 6 discusses + some future directions. Concluding remarks are given in Section 7. + +2. Partial Order Delivery + + Partial order services are needed and can be employed as soon as a + complete ordering is not mandatory. When two objects can be + delivered in either order, there is no need to use an ordered service + that must delay delivery of the second one transmitted until the + first arrives as the following examples demonstrate. 
+ +2.1 Example 1: Remote Database + + Simpson's Sporting Goods (SSG) has recently installed a state-of- + the-art enterprise-wide network. Their first "network application" + is a client/server SQL database with the following four records, + numbered {1 2 3 4} for convenience: + + SALESPERSON LOCATION CHARGES DESCRIPTION + ------------- ----------------- --------- ----------------- + 1 Anderson Atlanta, GA $4,200 Camping Gear + 2 Baker Boston, MA $849 Camping Gear + 3 Crowell Boston, MA $9,500 Sportswear + 4 Dykstra Wash., DC $1,000 Sportswear + + SSG employees running the client-side of the application can query + the database server from any location in the enterprise net using + standard SQL commands and the results will be displayed on their + + + +Connolly, Amer & Conrad [Page 4] + +RFC 1693 An Extension to TCP: Partial Order Service November 1994 + + + screen. From the employee's perspective, the network is completely + reliable and delivers data (records) in an order that conforms to + their SQL request. In reality though, it is the transport layer + protocol which provides the reliability and order on top of an + unreliable network layer - one which introduces loss, duplication, + and disorder. + + Consider the four cases in Figure 1 - in the first query (1.a), + ordered by SALESPERSON, the records have only one acceptable order at + the destination, 1,2,3,4. This is evident due to the fact that there + are four distinct salespersons. If record 2 is received before + record 1 due to a network loss during transmission, the transport + service can not deliver it and must therefore buffer it until record + 1 arrives. An ordered service, also referred to as a virtual circuit + or FIFO channel, provides the desired level of service in this case. + + At the other extreme, an unordered service is motivated in Figure 1.d + where the employee has implicitly specified that any ordering is + valid simply by omitting the "order by" clause. Here any of 4! 
 = 24 + delivery orderings would satisfy the application, or from the + transport layer's point of view, all records are immediately + deliverable as soon as they arrive from the network. No record needs + to be buffered should it arrive out of sequential order. As notation, 4 + ordered objects are written 1;2;3;4 and 4 unordered objects are + written using a parallel operator: 1||2||3||4. + + Figures 1.b and 1.c demonstrate two possible partial orders that + permit 2 and 4 orderings respectively at the destination. Using the + notation just described, the valid orderings for the query in 1.b are + specified as 1;(2||3);4, which is to say that record 1 must be + delivered first followed by record 2 and 3 in either order followed + by record 4. Likewise, the ordering for 1.c is (1||2);(3||4). In + these two cases, an ordered service is too strict and an unordered + service is not strict enough. + + + + + + + + + + + + + + + + + +Connolly, Amer & Conrad [Page 5] + +RFC 1693 An Extension to TCP: Partial Order Service November 1994 + + + +-----------------------------------------------------------------+ + | SELECT SALESPERSON, LOCATION, CHARGES, DESCRIPTION | + | FROM BILLING_TABLE | + | | + | SALESPERSON LOCATION CHARGES DESCRIPTION | + | ------------- ----------------- --------- --------------- | + | 1 Anderson Atlanta, GA $4,200 Camping Gear | + | 2 Baker Boston, MA $849 Camping Gear | + | 3 Crowell Boston, MA $9,500 Sportswear | + | 4 Dykstra Wash., DC $1,000 Sportswear | + +=================================================================+ + |a - ORDER BY SALESPERSON | + | | + | 1,2,3,4 1,2,3,4 | + | | + | Sender -----------> NETWORK --------------> Receiver | + | (1 valid ordering) | + +-----------------------------------------------------------------+ + |b - ORDER BY LOCATION | + | 1,2,3,4 | + | 1,2,3,4 1,3,2,4 | + | | + | Sender -----------> NETWORK --------------> Receiver | + | (2 valid orderings) | +
 +-----------------------------------------------------------------+ + |c - ORDER BY DESCRIPTION | + | 1,2,3,4 | + | 2,1,3,4 | + | 1,2,3,4 1,2,4,3 | + | 2,1,4,3 | + | | + | Sender -----------> NETWORK --------------> Receiver | + | (4 valid orderings) | + +-----------------------------------------------------------------+ + |d - (no order by clause) | + | 1,2,3,4 | + | 1,2,4,3 | + | 1,2,3,4 ... | + | 4,3,2,1 | + | | + | Sender -----------> NETWORK --------------> Receiver | + | (4!=24 valid orderings) | + +-----------------------------------------------------------------+ + Figure 1: Ordered vs. Partial Ordered vs. Unordered Delivery + + It is vital for the transport layer to recognize the exact + requirements of the application and to ensure that these are met. + However, there is no inherent need to exceed these requirements; on + + + +Connolly, Amer & Conrad [Page 6] + +RFC 1693 An Extension to TCP: Partial Order Service November 1994 + + + the contrary, by exceeding these requirements unnecessary resources + are consumed. This example application requires a reliable + connection - all records must eventually be delivered - but has some + flexibility when it comes to record ordering. + + In this example, each query has a different partial order. In total, + there exist 16 different partial orders for the desired 4 records. + For an arbitrary number of objects N, there exist many possible + partial orders each of which accepts some number of valid orderings + between 1 and N! (which correspond to the ordered and unordered + cases respectively). For some classes of partial orders, the number + of valid orderings can be calculated easily, for others this + calculation is intractable. An in-depth discussion on calculating + and comparing the number of orderings for a given partial order can + be found in [ACCD93a].
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Connolly, Amer & Conrad [Page 7] + +RFC 1693 An Extension to TCP: Partial Order Service November 1994 + + +2.2 Example 2: Multimedia + + A second example application that motivates a partial order service + is a multimedia broadcast involving video, audio and text components. + Consider an extended presentation of the evening news - extended to + include two distinct audio channels, a text subtitle and a closed- + captioned sign language video for the hearing impaired, in addition + to the normal video signal, as modeled by the following diagram. + + (left audio) (right audio) + +------+ +------+ + | ++++ | | ++++ | + | ++++ | | ++++ | + +------+ +------+ + =================================================== + I +---------------+I + I | |I + I | (hand signs) |I + I | |I + I +---------------+I + I I + I I + I (Main Video) I + I I + I I + I I + I I + I +------------------------------------------+ I + I | (text subtitle) | I + I +------------------------------------------+ I + I I + =================================================== + Figure 2: Multimedia broadcast example + + The multimedia signals have differing characteristics. The main video + signal may consist of full image graphics at a rate of 30 images/sec + while the video of hand signs requires a lower quality, say 10 + images/sec. Assume the audio signals are each divided into 60 sound + fragments/sec and the text object each second consists of either (1) + new text, (2) a command to keep the previous second of text, or (3) a + command for no subtitle. + + During a one-second interval of the broadcast, a sender transmits 30 + full-motion video images, 10 closed-captioned hand sign images, 60 + packets of a digitized audio signal for each of the audio streams and + a single text packet. 
The following diagram then might represent the + characteristics of the multimedia presentation in terms of the media + types, the number of each, and their ordering. Objects connected by a + + + +Connolly, Amer & Conrad [Page 8] + +RFC 1693 An Extension to TCP: Partial Order Service November 1994 + + + horizontal line must be received in order, while those in parallel + have no inherent ordering requirement. + ++----------------------------------------------------------------------+ +| | +| |-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-...-o-|-o-|-o-| right audio | +| | | | | | | | | | | | | (60/sec) | +| | | | | | | | | | | | | | +| |-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-...-o-|-o-|-o-| left audio | +| | | | | | | | (60/sec) | +| | | | | | | | | +| |---o---|---o---|---o---|---o---|---...---|---o---| normal video | +| | | | (30/sec) | +| | | | | +| |-----------o-----------|--------o--...--------o--| hand signs | +| | | (10/sec) | +| | | | +| |-----------------------------o-----...-----------| text | +| | | (1/sec) | +| | ++----------------------------------------------------------------------+ + Figure 3: Object ordering in multimedia application + + Of particular interest to our discussion of partial ordering is the + fact that, while objects of a given media type generally must be + received in order, there exists flexibility between the separate + "streams" of multimedia data (where a "stream" represents the + sequence of objects for a specific media type). Another significant + characteristic of this example is the repeating nature of the object + orderings. Figure 3 represents a single, one-second, partial order + snapshot in a stream of possibly thousands of repeating sequential + periods of communication. + + It is assumed that further synchronization concerns in presenting the + objects are addressed by a service provided on top of the proposed + partial order service. Temporal ordering for synchronized playback + is considered, for example, in [AH91, HKN91]. 
+ +2.3 Example 3: Windows Screen Refresh + + A third example to motivate a partial order service involves + refreshing a workstation screen/display containing multiple windows + from a remote source. In this case, objects (icons, still or video + images) that do not overlap have a "parallel" relationship (i.e., + their order of refreshing is independent) while overlapping screen + objects have a "sequential" relationship and should be delivered in + order. Therefore, the way in which the windows overlap induces a + partial order. + + + +Connolly, Amer & Conrad [Page 9] + +RFC 1693 An Extension to TCP: Partial Order Service November 1994 + + + Consider the two cases in Figure 4. A sender wishes to refresh a + remote display that contains four active windows (objects) named {1 2 + 3 4}. Assume the windows are transmitted in numerical order and the + receiving application refreshes windows as soon as the transport + service delivers them. If the windows are configured as in Figure + 4a, then there exist two different orderings for redisplay, namely + 1,2,3,4 or 1,3,2,4. If window 2 is received before window 1, the + transport service cannot deliver it or an incorrect image will be + displayed. In Figure 4b, the structure of the windows results in six + possible orderings - 1,2,3,4 or 1,3,2,4 or 1,3,4,2 or 3,4,1,2 or + 3,1,4,2 or 3,1,2,4. 
+ + +================================+============================+ + |a +-----------+ |b +----------+ | + | | 1 | | | 1 | | + | | | | | +----------+ | + | +---------+ +----------+ | +-----| 2 | | + | | 2 |----| 3 | | | | | + | | +-----------+ | | +----------+ | + | | | 4 | | | +----------+ | + | +-----| |-------+ | | 3 | | + | | | | | +----------+ | + | +-----------+ | +------| 4 | | + | | | | | + | | +----------+ | + | | | + | 1;(2||3);4 | (1;2)||(3;4) | + +================================+============================+ + Figure 4: Window screen refresh + +2.4 Potential Savings + + In each of these examples, the valid orderings are strictly dependent + upon, and must be specified by the application. Intuitively, as the + number of acceptable orderings increases, the amount of resources + utilized by a partial order transport service, in terms of buffers + and retransmissions, should decrease as compared to a fully ordered + transport service thus also decreasing the overall cost of the + connection. Just how much lower will depend largely upon the + flexibility of the application and the quality of the underlying + network. + + As an indication of the potential for improved service, let us + briefly look at the case where a database has the following 14 + records. 
+ + + + + + +Connolly, Amer & Conrad [Page 10] + +RFC 1693 An Extension to TCP: Partial Order Service November 1994 + + + SALESPERSON LOCATION CHARGES DESCRIPTION + ------------- ----------------- --------- --------------- + 1 Anderson Washington $4,200 Camping Gear + 2 Anderson Philadelphia $2,000 Golf Equipment + 3 Anderson Boston $450 Bowling shoes + 4 Baker Boston $849 Sportswear + 5 Baker Washington $3,100 Weights + 6 Baker Washington $2000 Camping Gear + 7 Baker Atlanta $290 Baseball Gloves + 8 Baker Boston $1,500 Sportswear + 9 Crowell Boston $9,500 Camping Gear + 10 Crowell Philadelphia $6,000 Exercise Bikes + 11 Crowell New York $1,500 Sportswear + 12 Dykstra Atlanta $1,000 Sportswear + 13 Dykstra Dallas $15,000 Rodeo Gear + 14 Dykstra Miami $3,200 Golf Equipment + + Using formulas derived in [ACCD93a] one may calculate the total + number of valid orderings for any partial order that can be + represented in the notation mentioned previously. For the case where + a user specifies "ORDER BY SALESPERSON", the partial order above can + be expressed as, + + (1||2||3);(4||5||6||7||8);(9||10||11);(12||13||14) + + Of the 14!=87,178,291,200 total possible combinations, there exist + 25,920 valid orderings at the destination. A service that may + deliver the records in any of these 25,920 orderings has a great deal + more flexibility than in the ordered case where there is only 1 valid + order for 14 objects. It is interesting to consider the real + possibility of hundreds or even thousands of objects and the + potential savings in communication costs. + + In all cases, the underlying network is assumed to be unreliable and + may thus introduce loss, duplication, and disorder. It makes no + sense to put a partial order service on top of a reliable network. 
+ While the exact amount of unreliability in a network may vary and is + not always well understood, initial experimental research indicates + that real world networks, for example the service provided by the + Internet's IP level, "yield high losses, duplicates and reorderings + of packets" [AS93,BCP93]. The authors plan to conduct further + experimentation into measuring Internet network unreliability. This + information would say a great deal about the practical merit of a + partial order service. + + + + + + + +Connolly, Amer & Conrad [Page 11] + +RFC 1693 An Extension to TCP: Partial Order Service November 1994 + + +3. Reliability vs. Order + + While TCP avoids the loss of even a single object, in fact for many + applications, there exists a genuine ability to tolerate loss. + Losing one frame per second in a 30 frame per second video or losing + a segment of its accompanying audio channel is usually not a problem. + Bearing this in mind, it is of value to consider a quality of service + that combines a partial order with a level of tolerated loss (partial + reliability). Traditionally there exist 4 services: reliable- + ordered, reliable-unordered, unreliable-ordered, and unreliable- + unordered. See Figure 5. Reliable-ordered service (denoted by a + single point) represents the case where all objects are delivered in + the order transmitted. File transfer is an example application + requiring such a service. + + reliable-ordered reliable-unordered + | | + | | + v v + zero loss-->*---------------------------------* + min loss-->|<-- |<-- + . | | + . |<-- |<-- + | | + |<-- unreliable- |<-- unreliable- + RELIABILITY | ordered | unordered + |<-- |<-- + | | + |<-- |<-- + max loss-->| | + +-+--+--+--+--+--+--+--+--+--+--+-+ + ordered partial ordered unordered + + ORDER + + Figure 5: Quality Of Service: Reliability vs. 
Order - + Traditional Service Types + + In a reliable-unordered service (also a single point), all objects + must be delivered, but not necessarily according to the order + transmitted; in fact, any order will suffice. Some transaction + processing applications such as credit card verification require such + a service. + + Unreliable-ordered service allows some objects to be lost. Those + that are delivered, however, must arrive in relative order (An + "unreliable" service does not necessarily lose objects; rather, it + may do so without failing to provide its advertised quality of + + + +Connolly, Amer & Conrad [Page 12] + +RFC 1693 An Extension to TCP: Partial Order Service November 1994 + + + service; e.g., the postal system provides an unreliable service). + Since there are varying degrees of unreliability, this service is + represented by a set of points in Figure 5. An unreliable-ordered + service is applicable to packet-voice or teleconferencing + applications. + + Finally unreliable-unordered service allows objects to be lost and + delivered in any order. This is the kind of service used for normal + e-mail (without acknowledgment receipts) and electronic announcements + or junk e-mail. + + As mentioned previously, the concept of a partial order expands the + order dimension from the two extremes of ordered and unordered to a + range of discrete possibilities as depicted in Figure 6. + Additionally, as will be discussed presently, the notion of + reliability is extended to allow for varying degrees of reliability + on a per-object basis providing even greater flexibility and improved + resource utilization. + + reliable-PO + + | | | | | | | | | | | | + | | | | | | | | | | | | + v v v v v v v v v v v v + zero loss-->*---------------------------------* + min loss-->| . . . . . . . . . . . | + . | . . . . . . . . . . . | + . | . . . . . . . . . . . | + | . . . . . . | + RELIABILITY | . . . unreliable-PO . . . | + | . . . . . . . . . . . | + | . . . . . . . . . 
 . . | + | . . . . . . . . . . . | + | . . . . . . . . . . . | + max loss-->| . . . . . . . . . . . | + +-+--+--+--+--+--+--+--+--+--+--+-+ + ordered partial ordered unordered + + ORDER + + Figure 6: Quality Of Service: Reliability vs. Order - Partial + Order Service + +3.1 Reliability Classes + + When considering unreliable service, one cannot assume that all + objects are equal with regards to their reliability. This + classification is reasonable if all objects are identical (e.g., + + + +Connolly, Amer & Conrad [Page 13] + +RFC 1693 An Extension to TCP: Partial Order Service November 1994 + + + video frames in a 30 frame/second film). Many applications, such as + multimedia systems, however, often contain a variety of object types. + Thus three object reliability classes are proposed: BART-NL, BART-L, + and NBART-L. Objects are assigned to one of these classes depending + on their temporal value as will be shown presently. + + BART-NL objects must be delivered to the destination. These objects + have temporal value that lasts for an entire established connection + and require reliable delivery (NL = No Loss allowed). An example of + BART-NL objects would be the database records in Example 2.1 or the + windows in the screen refresh in Example 2.3. If all objects are of + type BART-NL, the service is reliable. One possible way to assure + eventual delivery of a BART-NL object in a protocol is for the sender + to buffer it, start a timeout timer, and retransmit it if no ACK + arrives before the timeout. The receiver in turn returns an ACK when + the object has safely arrived and been delivered (BART = Buffers, + ACKs, Retransmissions, Timers). + + BART-L objects are those that have temporal value over some + intermediate amount of time - enough to permit timeout and + retransmission, but not everlasting. Once the temporal value of + these objects has expired, it is better to presume them lost than to + delay further the delivery pipeline of information.
One possibility + for deciding when an object's usefulness has expired is to require + each object to contain information defining its precise temporal + value [DS93]. An example of a BART-L object would be a movie + subtitle, sent in parallel with associated film images, which is + valuable any time during a twenty second film sequence. If not + delivered sometime during the first ten seconds, the subtitle loses + its value and can be presumed lost. These objects are buffered- + ACKed-retransmitted up to a certain point in time and then presumed + lost. + + NBART-L objects are those with temporal values too short to bother + timing out and retransmitting. An example of a NBART-L object would + be a single packet of speech in a packetized phone conversation or + one image in a 30 image/sec film. A sender transmits these objects + once and the service makes a best effort to deliver them. If the one + attempt is unsuccessful, no further attempts are made. + + An obvious question comes to mind - what about NBART-NL objects? Do + such objects exist? The authors have considered the notion of + communicating an object without the use of BART and still being able + to provide a service without loss. Perhaps with the use of forward + error correction this may become a viable alternative and could + certainly be included in the protocol. However, for our purposes in + this document, only the first three classifications will be + considered. + + + +Connolly, Amer & Conrad [Page 14] + +RFC 1693 An Extension to TCP: Partial Order Service November 1994 + + + While classic transport protocols generally treat all objects + equally, the sending and receiving functions of a protocol providing + partial order/partial reliability service will behave differently for + each class of object. For example, a sender buffers and, if + necessary, retransmits any BART-NL or BART-L objects that are not + acknowledged within a predefined timeout period. 
On the contrary, + NBART-L objects are forgotten as soon as they are transmitted. + +4. Partial Order Connection + + The implementation of a protocol that provides partial order service + requires, at a minimum, (1) communication of the partial ordering + between the two endpoints, and (2) dynamic evaluation of the + deliverability of objects as they arrive at the receiver. In + addition, this RFC describes the mechanisms needed to (3) initiate a + connection, (4) provide varying degrees of reliability for the + objects being transmitted, and (5) improve buffer utilization at the + sender based on object reliability. + + Throughout the discussion of these issues, the authors use the + generic notion of "objects" in describing the service details. Thus, + one of the underlying requirements of a partial order service is the + ability to handle such an abstraction (e.g., recognize object + boundaries). The details of object management are implementation + dependent and thus are not specified in this RFC. However, as this + represents a potential fundamental change to the TCP protocol, some + discussion is in order. + + At one extreme, it is possible to consider octets as objects and + require that the application specify the partial order accordingly + (octet by octet). This likely would entail an inordinate amount of + overhead, processing each octet on an individual basis (literally + breaking up contiguous segments to determine which, if any, octets + are deliverable and which are not). At the other extreme, the + transport protocol could maintain object atomicity regardless of size + - passing arbitrarily large data structures to IP for transmission. + At the sending side of the connection this would actually work since + IP is prepared to perform source fragmentation, however, there is no + guarantee that the receiving IP will be able to reassemble the + fragments! IP relies on the TCP max segment size to prevent this + situation from occurring[LMKQ89]. 
+ + A more realistic approach given the existing IP constraints might be + to maintain the current notion of a TCP max segment size for the + lower-layer interface with IP while allowing a much larger object + size at the upper-layer interface. Of course this presents some + additional complexities. First of all, the transport layer will now + have to be concerned with fragmentation/reassembly of objects larger + + + +Connolly, Amer & Conrad [Page 15] + +RFC 1693 An Extension to TCP: Partial Order Service November 1994 + + + than the max segment size and secondly, the increased object sizes + will require significantly more buffer space at the receiver if we + want to buffer the object until it arrives in entirety. + Alternatively, one may consider delivering "fragments" of an object + as they arrive as long as the ordering of the fragments is correct + and the application is able to process the fragments (this notion of + fragmented delivery is discussed further in Section 6). + +4.1 Connection Establishment + + By extending the transport paradigm to allow partial ordering and + reliability classes, a user application may be able to take advantage + of a more efficient data transport facility by negotiating the + optimal service level which is required - no more, no less. This is + accomplished by specifying these variables as QOS parameters or, in + TCP terminology, as options to be included in the TCP header [Pos81]. + + A TCP implementation that provides a partial order service requires + the use of two new TCP options. The first is an enabling option + "POC-permitted" (Partial Order Connection Permitted) that may be used + in a SYN segment to request a partial order service. The other is + the "POC-service-profile" option which is used periodically to + communicate the service characteristics. This second option may be + sent only after successful transmission and acknowledgment of the + POC-permitted option. 
+ + A user process issuing either an active or passive OPEN may choose to + include the POC-permitted option if the application can benefit from + the use of a partial order service and in fact, in cases where the + viability of such service is unknown, it is suggested that the option + be used and that the decision be left to the user's peer. + + For example, a multimedia server might issue a passive with the + POC-permitted option in preparation for the connection by a remote + user. + + Upon reception of a segment with the POC-permitted option, the + receiving user has the option to respond with a similar POC-permitted + indication or may reject a partial order connection if the + application does not warrant the service or the receiving user is + simply unable to provide such a service (e.g., does not recognize the + POC-permitted option). + + In the event that simultaneous initial segments are exchanged, + the TCP will initiate a partial order connection only if both sides + include the POC-permitted option. + + + + + +Connolly, Amer & Conrad [Page 16] + +RFC 1693 An Extension to TCP: Partial Order Service November 1994 + + + A brief example should help to demonstrate this procedure. The + following notation (a slight simplification on that employed in RFC + 793) will be used. Each line is numbered for reference purposes. + TCP-A (on the left) will play the role of the receiver and TCP-B will + be the sender. Right arrows (-->) indicate departure of a TCP + segment from TCP-A to TCP-B, or arrival of a segment at B from A. + Left arrows indicate the reverse. TCP states represent the state + AFTER the departure or arrival of the segment (whose contents are + shown in the center of the line). Liberties are taken with the + contents of the segments where only the fields of interest are shown. + + TCP-A TCP-B + + 1. CLOSED LISTEN + + 2. SYN-SENT --> --> SYN-RECEIVED + + 3. ESTABLISHED <-- <-- SYN-RECEIVED + + 4. ESTABLISHED --> --> ESTABLISHED + + Figure 7. 
Basic 3-Way handshake for a partial order connection + + In line 1 of Figure 7, the sending user has already issued a passive + OPEN with the POC-permitted option and is waiting for a connection. + In line 2, the receiving user issues an active OPEN with the same + option which in turn prompts TCP-A to send a SYN segment with the + POC-permitted option and enter the SYN-SENT state. TCP-B is able to + confirm the use of a PO connection and does so in line 3, after which + TCP-A enters the established state and completes the connection with + an ACK segment in line 4. + + In the event that either side is unable to provide partial order + service, the POC-permitted option will be omitted and normal TCP + processing will ensue. + + For completeness, the authors include the following specification for + both the POC-permitted option and the POC-service-profile option in a + format consistent with the TCP specification document [Pos81]. + + TCP POC-permitted Option: + + Kind: 9 Length: - 2 bytes + + +-----------+-------------+ + | Kind=9 | Length=2 | + +-----------+-------------+ + + + + +Connolly, Amer & Conrad [Page 17] + +RFC 1693 An Extension to TCP: Partial Order Service November 1994 + + + TCP POC-service-profile Option: + + Kind: 10 Length: 3 bytes + + 1 bit 1 bit 6 bits + +----------+----------+------------+----------+--------+ + | Kind=10 | Length=3 | Start_flag | End_flag | Filler | + +----------+----------+------------+----------+--------+ + + The first option represents a simple indicator communicated between + the two peer transport entities and needs no further explanation. + The second option serves to communicate the information necessary to + carry out the job of the protocol - the type of information which is + typically found in the header of a TCP segment - and raises some + interesting questions. + + Standard TCP maintains a 60-byte maximum header size on all segments. 
+ The obvious intuition behind this rule is that one would like to + minimize the amount of overhead information present in each packet + while simultaneously increasing the payload, or data, section. While + this is acceptable for most TCP connections today, a partial-order + service would necessarily require that significantly more control + information be passed between transport entities at certain points + during a connection. Maintaining the strict interpretation of this + rule would prove to be inefficient. If, for example, the service + profile occupied a total of 400 bytes (a modest amount as will be + confirmed in the next section), then one would have to fragment this + information across at least 10 segments, allocating 20 bytes per + segment for the normal TCP header. + + Instead, the authors propose that the service profile be carried in + the data section of the segment and that the 3-byte POC-service- + profile option described above be placed in the header to indicate + the presence of this information. Upon reception of such a segment, + the TCP extracts the service profile and uses it appropriately as + will be discussed in the following sections. + + The option itself, as shown here, contains two 1-bit flags necessary + to handle the case where the service profile does not fit in a single + TCP segment. The "Start_flag" indicates that the information in the + data section represents the beginning of the service profile and the + "End_flag" represents the converse. For service profiles which fit + completely in a single segment, both flags will be set to 1. + Otherwise, the Start_flag is set in the initial segment and the + End_flag in the final segment allowing the peer entity to reconstruct + the entire service profile (using the normal sequence numbers in the + segment header). The "Filler" field serves merely to complete the + third byte of the option. 
+ + + +Connolly, Amer & Conrad [Page 18] + +RFC 1693 An Extension to TCP: Partial Order Service November 1994 + + + Note that the length of the service profile may vary during the + connection as the order or reliability requirements of the user + change but this length must not exceed the buffering ability of the + peer TCP entity since the entire profile must be stored. The exact + makeup of this data structure is presented in Section 4.2. + +4.2 Data Transmission + + Examining the characteristics of a partial order TCP in chronological + fashion, one would start off with the establishment of a connection + as described in Section 4.1. After which, although both ends have + acknowledged the acceptability of partial order transport, neither + has actually begun a partial order transmission - in other words, + both the sending-side and the receiving-side are operating in a + normal, ordered-reliable mode. For the subsequent discussion, an + important distinction is made in the terms sending-side and + receiving-side which refer to the data flow from the sender and that + from the receiver, respectively. + + For the partial ordering to commence, the TCP must be made aware of + the acceptable object orderings and reliability for both the send- + side and receive-side of the connection for a given set of objects + (hereafter referred to as a "period"). This information is contained + in the service profile and it is the responsibility of the user + application to define this profile. Unlike standard TCP where + applications implicitly define a reliable, ordered profile; with + partial order TCP, the application must explicitly define a profile. + + The representation of the service profile is one of the concerns for + the transport protocol. It would be useful if the TCP could encode a + partial ordering in as few bits as possible since these bits will be + transmitted to the destination each time the partial order changes. 
+ A matrix representation appears to be well-suited to encoding the + partial order and a vector has been proposed to communicate and + manage the reliability aspects of the service. Temporal values may + be included within the objects themselves or may be defined as a + function of the state of the connection [DS93]. Using these data + structures, the complete service profile would include (1) a partial + order matrix, (2) a reliability vector and (3) an object_sizes vector + which represents the size of the objects in octets (see + [ACCD93a,CAC93] for a discussion on alternative structures for these + variables). + + Throughout this section, we use the following service profile as a + running example. Shown here is a partial order matrix and graphical + representation for a simple partial order with 6 objects - + ((1;2)||(3;4)||5);6. In the graphical diagram, arrows (-->) denote + sequential order and objects in parallel can be delivered in either + + + +Connolly, Amer & Conrad [Page 19] + +RFC 1693 An Extension to TCP: Partial Order Service November 1994 + + + order. So in this example, object 2 must be delivered after object + 1, object 4 must be delivered after object 3, and object 6 must be + delivered after objects 1 through 5 have all been delivered. Among + the 6 objects, there are 30 valid orderings for this partial order + (each valid ordering is known as a linear extension of the partial + order). + + 1 2 3 4 5 6 + +-------------+ + 1 | - 1 0 0 0 1 | | | | + 2 | - - 0 0 0 1 | |-->1-->|-->2-->| | + 3 | - - - 1 0 1 | | | | + 4 | - - - - 0 1 | |-->3-->|-->4-->|-->6-->| + 5 | - - - - - 1 | | | | + 6 | - - - - - - | |------>5------>| | + +-------------+ | | | + + PO Matrix PO Graph + + + In the matrix, a 1 in row i of column j denotes that object i must be + delivered before object j. Note that if objects are numbered in any + way such that 1,2,3,...,N is a valid ordering, only the upper right + triangle of the transitively closed matrix is needed [ACCD93a]. 
+ Thus, for N objects, the partial order can be encoded in (N*(N-1)/2) + bits. + + The reliability vector for the case where reliability classes are + enumerated types such as {BART-NL=1, BART-L=2, NBART-L = 3} and all + objects are BART-NL would simply be, <1, 1, 1, 1, 1, 1>. Together + with the object_sizes vector, the complete service profile is + described. + + This information must be packaged and communicated to the sending TCP + before the first object is transmitted using a TCP service primitive + or comparable means depending upon the User/TCP interface. Once the + service profile has been specified to the TCP, it remains in effect + until the connection is closed or the sending user specifies a new + service profile. In the event that the largest object size can not + be processed by the receiving TCP, the user application is informed + that the connection cannot be maintained and the normal connection + close procedure is followed. + + Typically, as has been described here, the service profile definition + and specification is handled at the sending end of the connection, + but there could be applications (such as the screen refresh) where + the receiving user has this knowledge. Under these circumstances the + receiving user is obliged to transmit the object ordering on the + + + +Connolly, Amer & Conrad [Page 20] + +RFC 1693 An Extension to TCP: Partial Order Service November 1994 + + + return side of the connection (e.g., when making the request for a + screen refresh) and have the sender interpret this data to be used on + the send side of the connection. + + Requiring that the sending application specify the service profile is + not an arbitrary choice. To ensure proper object identification, the + receiving application must transmit the new object numbering to the + sending application (not the sending transport layer). 
Since the + sending application must receive this information in any case, it + simplifies matters greatly to require that the sending application be + the only side that may specify the service profile to the transport + layer. + + Consider now the layered architecture diagram in Figure 8 and assume + that a connection already is established. Let us now say that UserA + specifies the service profile for the sending-side of the connection + via its interface with TCP-A. TCP-A places the profile in the header + of one or more data packets (depending upon the size of the service + profile, the profile may require several packets), sets the POC- + service-profile option and passes it to IP for transmission over the + network. This packet must be transmitted reliably, therefore TCP-A + buffers it and starts a normal retransmit timer. Subsequently, the + service profile arrives at the destination node and is handed to + TCP-B (as indicated by the arrows in Figure 8). TCP-B returns an + acknowledgment and immediately adopts the service profile for one + direction of data flow over the connection. When the acknowledgment + arrives back at TCP-A, the cycle is complete and both sides are now + able to use the partial order service. + + +--------+ +----------+ + Service | UserA | | UserB | + Profile +--------+ +----------+ + | | | + | | | + v | | + | +---------+ +-----------+ Service + | | TCP-A | | TCP-B | Profile + | +---------+ +-----------+ ^ + | | | | + | | | | + | | | | + | +---------------------------------------+ | + v | | | + ------>| ---- Service Profile -------------> |-----> + +---------------------------------------+ + + Figure 8. Layered Communication Architecture + + + + +Connolly, Amer & Conrad [Page 21] + +RFC 1693 An Extension to TCP: Partial Order Service November 1994 + + + Note that one of the TCP entities learns of the profile via its user + interface, while the other TCP entity is informed via its network + interface. 
+ + For the remaining discussions, we will assume that a partial order + profile has been successfully negotiated for a single direction of + the connection (as depicted in Figure 8) and that we may now speak of + a "sending TCP" (TCP-A) and a "receiving TCP" (TCP-B). As such, + TCP-A refers to the partial order data stream as the "send-side" of + the connection, while TCP-B refers to the same data stream as the + "receive-side". + + Having established a partial order connection, the communicating TCPs + each have their respective jobs to perform to ensure proper data + delivery. The sending TCP ascertains the object ordering and + reliability from the service profile and uses this information in its + buffering/retransmission policy. The receiver modifications are more + significant, particularly the issues of object deliverability and + reliability. And both sides will need to redefine the notion of + window management. Let us look specifically at how each side of the + TCP connection is managed under this new paradigm. + +4.2.1 Sender + + The sender's concerns are still essentially four-fold - transmitting + data, managing buffer space, processing acknowledgments and + retransmitting after a time-out - however, each takes on a new + meaning in a partial order service. Additionally, the management of + the service profile represents a fifth duty not previously needed. + + Taking a rather simplistic view, normal TCP output processing + involves (1) setting up the header, (2) copying user data into the + outgoing segment, (3) sending the segment, (4) making a copy in a + send buffer for retransmission and (5) starting a retransmission + timer. The only difference with a partial order service is that the + reliability vector must be examined to determine whether or not to + buffer the object and start a timer - if the object is classified as + NBART-L, then steps 4 and 5 are omitted. 
+ + Buffer management at the sending end of a partial order connection is + dependent upon the object reliability class and the object size. + When transmitting NBART-L objects the sender need not store the data + for later possible retransmission since NBART-L objects are never + retransmitted. The details of buffer management - such as whether to + allocate fixed-size pools of memory, or perhaps utilize a dynamic + heap allocation strategy - are left to the particular system + implementer. + + + + +Connolly, Amer & Conrad [Page 22] + +RFC 1693 An Extension to TCP: Partial Order Service November 1994 + + + Acknowledgment processing remains essentially intact - + acknowledgments are cumulative and specify the peer TCP's window + advertisement. However, determination of this advertisement is no + longer a trivial process dependent only upon the available buffer + space (this is discussed further in Section 4.2.2). Moreover, it + should be noted that the introduction of partial ordering and partial + reliability presents several new and interesting alternatives for the + acknowledgment policy. The authors are investigating several of + these strategies through a simulation model and have included a brief + discussion of these issues in Section 6. + + The retransmit function of the TCP is entirely unchanged and is + therefore not discussed further. + + For some applications, it may be possible to maintain the same + partial order for multiple periods (e.g., the application repeats the + same partial order). In the general case, however, the protocol must + be able to change the service profile during an existing connection. + When a change in the service profile is requested, the sending TCP is + obliged to complete the processing of the current partial order + before commencing with a new one. 
This ensures consistency between + the user applications in the event of a connection failure and + simplifies the protocol (future study is planned to investigate the + performance improvement gained by allowing concurrent different + partial orders). The current partial order is complete when all + sending buffers are free. Then negotiation of the new service + profile is performed in the same manner as with the initial profile. + + Combining these issues, we propose the following simplified state + machine for the protocol (connection establishment and tear down + remains the same and is not show here). + + (1)Send Request (5)Ack Arrival + +------+ +-----------+ + | | | | + | V | | + +----------+ (4) New PO Profile +----------+ | + +---->| |----------------------->| PO |<-----+ + | | ESTAB | | | + (2) | | | | SETUP | + Ack +-----| |<-----------------------| |<-----+ + Arrival +----------+ (7)PO Setup Complete +----------+ | + ^ | | | + | | | | + +------+ +---------+ + (3)Timeout (6)Timeout + + + + + +Connolly, Amer & Conrad [Page 23] + +RFC 1693 An Extension to TCP: Partial Order Service November 1994 + + + Event (1) - User Makes a Data Send Request + ========= + If Piggyback Timer is set then + cancel piggyback timer + Package and send the object (with ACK for receive-side) + If object type = (BART-L,BART-NL) then + Store the object and start a retransmit timer + If sending window is full then + Block Event (1) - allow no further send requests from user + + Event (2) - ACK Arrives + ========= + If ACKed object(s) is buffered then + Release the buffer(s) and stop the retransmit timer(s) + Extract the peer TCP's window advertisement + If remote TCP's window advertisement > sending window then + Enable Event (1) + If remote TCP's window advertisement <= sending window then + Block Event (1) - allow no further send requests from user + Adjust sending window based on received window advertisement + + Event (3) - Retransmit Timer Expires + ========= + If Piggyback 
Timer is set then + cancel piggyback timer + Re-transmit the segment (with ACK for receive-side) + Restart the timer + + Event (4) - PO Service Profile Arrives at the User Interface + ========= + Transition to the PO SETUP state + Store the Send-side PO service profile + Package the profile into 1 or more segments, setting the + POC-Service-Profile option on each + If Piggyback Timer is set then + cancel piggyback timer + Send the segment(s) (with ACK for receive-side) + Store the segment(s) and start a retransmit timer + + Event (5) - ACK Arrival + ========= + If ACKed object(s) is buffered then + Release the buffer(s) and stop the retransmit timer(s) + Extract the peer TCP's window advertisement + If all objects from previous service profile have been ACKed and + the new service profile has been ACKed then enable Event (7) + + + + + +Connolly, Amer & Conrad [Page 24] + +RFC 1693 An Extension to TCP: Partial Order Service November 1994 + + + Event (6) - Retransmit Timer Expires + ========= + If Piggyback Timer is set then + cancel piggyback timer + Re-transmit the segment (with ACK for receive-side) + Restart the timer + + Event (7) - PO Setup Completed + ========= + Transition to the ESTAB state and begin processing new service + profile + +4.2.2 Receiver + + The receiving TCP has additional decisions to make involving object + deliverability, reliability and window management. Additionally, the + service profile must be established (and re-established) periodically + and some special processing must be performed at the end of each + period. + + When an object arrives, the question is no longer, "is this the next + deliverable object?", but rather, "is this ONE OF the next + deliverable objects?" Hence, it is convenient to think of a + "Deliverable Set" of objects with a partial order protocol. 
To + determine the elements of this set and answer the question of + deliverability, the receiver relies upon the partial order matrix + but, unlike the sender, the receiver dynamically updates the matrix + as objects are processed thus making other objects (possibly already + buffered objects) deliverable as well. A check of the object type + also must be performed since BART-NL and BART-L objects require an + ACK to be returned to the sender but NBART-L do not. Consider our + example from the previous section. + + 1 2 3 4 5 6 + +-------------+ + 1 | - 1 0 0 0 1 | | | | + 2 | - - 0 0 0 1 | |-->1-->|-->2-->| | + 3 | - - - 1 0 1 | | | | + 4 | - - - - 0 1 | |-->3-->|-->4-->|-->6-->| + 5 | - - - - - 1 | | | | + 6 | - - - - - - | |------>5------>| | + +-------------+ | | | + + PO Matrix PO Graph + + When object 5 arrives, the receiver scans column 5, finds that the + object is deliverable (since there are no 1's in the column) and + immediately delivers the object to the user application. Then, the + + + +Connolly, Amer & Conrad [Page 25] + +RFC 1693 An Extension to TCP: Partial Order Service November 1994 + + + matrix is updated to remove the constraint of any object whose + delivery depends on object 5 by clearing all entries of row 5. This + may enable other objects to be delivered (for example, if object 2 is + buffered then the delivery of object 1 will make object 2 + deliverable). This leads us to the next issue - delivery of stored + objects. + + In general, whenever an object is delivered, the buffers must be + examined to see if any other stored object(s) becomes deliverable. + CAC93 describes an efficient algorithm to implement this processing + based on traversing the precedence graph. + + Consideration of object reliability is interesting. The authors have + taken a polling approach wherein a procedure is executed + periodically, say once every 100 milliseconds, to evaluate the + temporal value of outstanding objects on which the destination is + waiting. 
Those whose temporal value has expired (i.e. which are no + longer useful as defined by the application) are "declared lost" and + treated in much the same manner as delivered objects - the matrix is + updated, and if the object type is BART-L, an ACK is sent. Any + objects from the current period which have not yet been delivered or + declared lost are candidates for the "Terminator" as the procedure is + called. The Terminator's criterion is not specifically addressed in + this RFC, but one example might be for the receiving user to + periodically pass a list of no-longer-useful objects to TCP-B. + + Another question which arises is, "How does one calculate the send + and receive windows?" With a partial order service, these windows + are no longer contiguous intervals of objects but rather sets of + objects. In fact, there are three sets which are of interest to the + receiving TCP one of which has already been mentioned - the + Deliverable Set. Additionally, we can think of the Bufferable Set + and the Receivable Set. Some definitions are in order: + + Deliverable Set: objects which can be immediately passed up to + the user. + + Buffered Set: objects stored in a buffer awaiting delivery. + + Bufferable Set: objects which can be stored but not immediately + delivered (due to some ordering constraint). + + Receivable Set: union of the Deliverable Set and the Bufferable + Set (which are disjoint) - intuitively, all objects which + are "receivable" must be either "deliverable" or + "bufferable". + + + + + +Connolly, Amer & Conrad [Page 26] + +RFC 1693 An Extension to TCP: Partial Order Service November 1994 + + + The following example will help to illustrate these sets. Consider + our simple service profile from earlier for the case where the size + of each object is 1 MByte and the receiver has only 2 MBytes of + buffer space (enough for 2 objects). 
Define a boolean vector of + length N (N = number of objects in a period) called the Processed + Vector which is used to indicate which objects from the current + period have been delivered or declared lost. Initially, all buffers + are empty and the PO Matrix and Processed Vector are as shown here, + + 1 2 3 4 5 6 + +-------------+ + 1 | - 1 0 0 0 1 | + 2 | - - 0 0 0 1 | + 3 | - - - 1 0 1 | + 4 | - - - - 0 1 | + 5 | - - - - - 1 | [ F F F F F F ] + 6 | - - - - - - | 1 2 3 4 5 6 + +-------------+ + + PO Matrix Processed Vector + + From the PO Matrix, it is clear that the Deliverable Set = + {(1,1),(1,3),(1,5)}, where (1,1) refers to object #1 from period #1, + assuming that the current period is period #1. + + The Bufferable Set, however, depends upon how one defines bufferable + objects. Several approaches are possible. The authors' initial + approach to determining the Bufferable Set can best be explained in + terms of the following rules, + + Rule 1: Remaining space must be allocated for all objects from + period i before any object from period i+1 is buffered + + Rule 2: In the event that there exists enough space to buffer + some but not all objects from a given period, space will + be reserved for the first objects (i.e. 1,2,3,...,k) + + With these rules, the Bufferable Set = {(1,2),(1,4)}, the Buffered + Set is trivially equal to the empty set, { }, and the Receivable Set + = {(1,1),(1,2),(1,3),(1,4),(1,5)}. + + Note that the current acknowledgment scheme uses the min and max + values in the Receivable Set for its window advertisement which is + transmitted in all ACK segments sent along the receive-side of the + connection (from receiver to sender). Moreover, the + "piggyback_delay" timer is still used to couple ACKs with return data + (as utilized in standard TCP). 
+ + + + +Connolly, Amer & Conrad [Page 27] + +RFC 1693 An Extension to TCP: Partial Order Service November 1994 + + + Returning to our example, let us now assume that object 1 and then 3 + arrive at the receiver and object 2 is lost. After processing both + objects, the PO Matrix and Processed Vector will have the following + updated structure, + + 1 2 3 4 5 6 + +-------------+ + 1 | - 0 0 0 0 0 | + 2 | - - 0 0 0 1 | + 3 | - - - 0 0 0 | + 4 | - - - - 0 1 | + 5 | - - - - - 1 | [ T F T F F F ] + 6 | - - - - - - | 1 2 3 4 5 6 + +-------------+ + + PO Matrix Processed Vector + + We can see that the Deliverable Set = {(1,2),(1,4),(1,5)}, but what + should the Bufferable Set consist of? Since only one buffer is + required for the current period's objects, we have 1 Mbyte of + additional space available for "future" objects and therefore include + the first object from period #2 in both the Bufferable and the + Receivable Set, + + Deliverable Set = {(1,2),(1,4),(1,5)} + + Bufferable Set = {(1,6),(2,1)} + + Buffered Set = { } + + Receivable Set = {(1,2),(1,4),(1,5),(1,6),(2,1)} + + In general, the notion of window management takes on new meaning with + a partial order service. One may re-examine the classic window + relations with a partial order service in mind and devise new, less + restrictive relations which may shed further light on the operation + of such a service. + + Two final details: (1) as with the sender, the receiver must + periodically establish or modify the PO service profile and (2) upon + processing the last object in a period, the receiver must re-set the + PO matrix and Processed vector to their initial states. + + + + + + + + + +Connolly, Amer & Conrad [Page 28] + +RFC 1693 An Extension to TCP: Partial Order Service November 1994 + + + Let us look at the state machine and pseudo-code for the receiver. 
+ + (2)Data Segment Arrival (5)PO Profile fragment Arrival + +------+ +-------+ + | | | | + | V (1)First PO Profile | V + +---------+ fragment arrives +---------+(6) Data Segment + +---->| |----------------------->| |<-----+ Arrival + | | ESTAB | | PO |------+ + | | | | | + | | | | SETUP |<-----+ +(3) +-----| |<-----------------------| |------+ +Terminator+---------+ (9)PO Setup complete +---------+(7) Terminator + ^ | | ^ + | | | | + +------+ +------+ + (4)Piggyback Timeout (8)Piggyback Timeout + + + Event 1 - First PO Service Profile fragment arrives at network + ======= interface + Transition to the PO SETUP state + Store the PO service profile (fragment) + Send an Acknowledgement of the PO service profile (fragment) + + Event 2 - Data Segment Arrival + ======= + If object is in Deliverable Set then + Deliver the object + Update PO Matrix and Processed Vector + Check buffers for newly deliverable objects + If all objects from current period have been processed then + Start the next period (re-initialize data structures) + Start piggyback_delay timer to send an ACK + Else if object is in Bufferable Set then + Store the object + Else + Discard object + Start piggyback_delay timer to send an ACK + + Event 3 - Periodic call of the Terminator + ======= + For all unprocessed objects in the current period do + If object is "no longer useful" then + Update PO Matrix and Processed Vector + If object is in a buffer then + Release the buffer + Check buffers for newly deliverable objects + + + +Connolly, Amer & Conrad [Page 29] + +RFC 1693 An Extension to TCP: Partial Order Service November 1994 + + + If all objects from current period have been processed + then Start the next period (re-initialize data + structures) + + Event 4 - Piggyback_delay Timer Expires + ======= + Send an ACK + Disable piggyback_delay timer + + Event 5 - PO Service Profile fragment arrives at network interface + ======= + Store the PO service profile (fragment) + Send an Acknowledgement of the PO 
service profile (fragment) + If entire PO Service profile has been received then enable Event + (9) + + Event 6 - Data Segment arrival + ======= + (See event 2) + + Event 7 - Periodic call of the terminator + ======= + (See Event 3) + + Event 8 - Piggyback_delay Timer Expires + ======= + (See Event 4) + + Event 9 - PO Setup Complete + ======= + Transition to the ESTAB state + + Note that, for reasons of clarity, we have used a transitively closed + matrix representation of the partial order. A more efficient + implementation based on an adjacency list representation of a + transitively reduced precedence graph results in a more efficient + running time [CAC93]. + +5. Quantifying and Comparing Partial Order Services + + While ordered, reliable delivery is ideal, the existence of less- + than-ideal underlying networks can cause delays for applications that + need only partial order or partial reliability. By introducing a + partial order service, one may in effect relax the requirements on + order and reliability and presumably expect some savings in terms of + buffer utilization and bandwidth (due to fewer retransmissions) and + shorter overall delays. A practical question to be addressed is, + "what are the expected savings likely to be?" + + + +Connolly, Amer & Conrad [Page 30] + +RFC 1693 An Extension to TCP: Partial Order Service November 1994 + + + As mentioned in Section 2, the extent of such savings will depend + largely on the quality of the underlying network - bandwidth, delay, + amount and distribution of loss/duplication/disorder - as well as the + flexibility of the partial order itself - specified by the PO matrix + and reliability vector. If the underlying network has no loss, a + partial order service essentially becomes an ordered service. + Collecting experimental data to ascertain realistic network + conditions is a straightforward task and will help to quantify in + general the value of a partial order service [Bol93]. 
But how can + one quantify and compare the cost of providing specific levels of + service? + + Preliminary research indicates that the number of linear extensions + (orderings) of a partial order in the presence of loss effectively + measures the complexity of that order. The authors have derived + formulae for calculating the number of extensions when a partial + order is series-parallel and have proposed a metric for comparing + partial orders based on this number [ACCD93b]. This metric could be + used as a means for charging for the service, for example. What also + may be interesting is a specific head-to-head comparison between + different partial orders with varying degrees of flexibility. Work + is currently underway on a simulation model aimed at providing this + information. And finally, work is underway on an implementation of + TCP which includes partial order service. + +6. Future Direction + + In addition to the simulation and implementation work the authors are + pursuing several problems related to partial ordering which will be + mentioned briefly. + + An interesting question arises when discussing the acknowledgment + strategy for a partial order service. For classic protocols, a + cumulative ACK of object i confirms all objects "up to and including" + i. But the meaning of "up to and including" with a partial order + service has different implications than with an ordered service. + + Consider our example partial order, ((1;2)||(3;4)||5);6. What + should a cumulative ACK of object 4 confirm? The most logical + definition would say it confirms receipt of object 4 and all objects + that precede 4 in the partial order, in this case, object 3. Nothing + is said about the arrival of objects 1 or 2. With this alternative + interpretation where cumulative ACKs depend on the partial order, the + sender must examine the partial order matrix to determine which + buffers can be released.
In this example, scanning column 4 of the + matrix reveals that object 3 must come before object 4 and therefore + both object buffers (and any buffers from a previous period) can be + released. + + + +Connolly, Amer & Conrad [Page 31] + +RFC 1693 An Extension to TCP: Partial Order Service November 1994 + + + Other partial order acknowledgment policies are possible for a + protocol providing a partial order service including the use of + selective ACKs (which has been proposed in [JB88] and implemented in + the Cray TCP [Chang93]) as well as the current TCP strategy where an + ACK of i also ACKs everything <= i (in a cyclical sequence number + space). The authors are investigating an ACK policy which utilizes a + combination of selective and "partial-order-cumulative" + acknowledgments. This is accomplished by replacing the current TCP + cumulative ACK with one which has the partial order meaning as + described above and augmenting this with intermittent selective ACKs + when needed. + + In another area, the notion of fragmented delivery, mentioned in the + beginning of Section 4, looks like a promising technique for certain + classes of applications which may offer a substantial improvement in + memory utilization. Briefly, the term fragmented delivery refers to + the ability to transfer less-than-complete objects between the + transport layer and the user application (or session layer as the + case may be). For example, a 1Mbyte object could potentially be + delivered in multiple "chunks" as segments arrive thus freeing up + valuable memory and reducing the delay on those pieces of data. The + scenario becomes somewhat more complex when multiple "parallel + streams" are considered where the application could now receive + pieces of multiple objects associated with different streams. 
+ + Additional work in the area of implementing a working partial order + protocol is being performed both at the University of Delaware and at + the LAAS du CNRS laboratory in Toulouse, France - particularly in + support of distributed, high-speed, multimedia communication. It will + be interesting to examine the processing requirements for an + implementation of a partial order protocol at key events (such as + object arrival) compared with a non-partial order implementation. + + Finally, the authors are interested in the realization of a network + application utilizing a partial order service. The aim of such work + is threefold: (1) provide further insight into the expected + performance gains, (2) identify new issues unique to partial order + transport and, (3) build a road-map for application designers + interested in using a partial order service. + +7. Summary + + This RFC introduces the concepts of a partial order service and + discusses the practical issues involved with including partial + ordering in a transport protocol. The need for such a service is + motivated by several applications including the vast fields of + distributed databases, and multimedia. The service has been + presented as a backward-compatible extension to TCP to adapt to + + + +Connolly, Amer & Conrad [Page 32] + +RFC 1693 An Extension to TCP: Partial Order Service November 1994 + + + applications with different needs specified in terms of QOS + parameters. + + The notion of a partial ordering extends QOS flexibility to include + object delivery, reliability, and temporal value thus allowing the + transport layer to effectively handle a wider range of applications + (i.e., any which might benefit from such mechanisms). The service + profile described in Section 4 accurately characterizes the QOS for a + partial order service (which encompasses the two extremes of total + ordered and unordered transport as well). 
+ + Several significant modifications have been proposed and are + summarized here: + + (1) Replacing the requirement for ordered delivery with one for + application-dependent partial ordering + + (2) Allowing unreliable and partially reliable data transport + + (3) Conducting a non-symmetrical connection (not entirely foreign + to TCP, the use of different MSS values for the two sides + of a connection is an example) + + (4) Management of "objects" rather than octets + + (5) Modified acknowledgment strategy + + (6) New definition for the send and receive "windows" + + (7) Extension of the User/TCP interface to include certain + QOS parameters + + (8) Use of new TCP options + + As evidenced by this list, a partial order and partial reliability + service proposes to re-examine several fundamental transport + mechanisms and, in so doing, offers the opportunity for substantial + improvement in the support of existing and new application areas. + + + + + + + + + + + + + +Connolly, Amer & Conrad [Page 33] + +RFC 1693 An Extension to TCP: Partial Order Service November 1994 + + +8. References + + [ACCD93a] Amer, P., Chassot, C., Connolly, T., and M. Diaz, + "Partial Order Transport Service for Multimedia + Applications: Reliable Service", Second International + Symposium on High Performance Distributed Computing + (HPDC-2), Spokane, Washington, July 1993. + + [ACCD93b] Amer, P., Chassot, C., Connolly, T., and M. Diaz, + "Partial Order Transport Service for Multimedia + Applications: Unreliable Service", Proc. INET '93, San + Francisco, August 1993. + + [AH91] Anderson, D., and G. Homsy, "A Continuous Media I/O + Server and its Synchronization Mechanism", IEEE + Computer, 24(10), 51-57, October 1991. + + [AS93] Agrawala, A., and D. Sanghi, "Experimental Assessment + of End-to-End Behavior on Internet," Proc. IEEE INFOCOM + '93, San Francisco, CA, March 1993. + + [BCP93] Claffy, K., Polyzos, G., and H.-W. Braun, "Traffic + Characteristics of the T1 NSFNET", Proc. 
IEEE INFOCOM + '93, San Francisco, CA, March 1993. + + [Bol93] Bolot, J., "End-to-End Packet Delay and Loss Behavior + in the Internet", SIGCOMM '93, Ithaca, NY, September + 1993. + + [CAC93] Conrad, P., Amer, P., and T. Connolly, "Improving + Performance in Transport-Layer Communications Protocols + by using Partial Orders and Partial Reliability", + Work in Progress, December 1993. + + [Chang93] Chang, Y., "High-Speed Transport Protocol Evaluation -- + the Final Report", MCNC Center for Communications + Technical Document, February 1993. + + [Dee89] Deering, S., "Host Extensions for IP Multicasting," STD + 5, RFC 1112 Stanford University, August 1989. + + [DS93] Diaz, M., and P. Senac, "Time Stream Petri Nets: A + Model for Multimedia Synchronization", Proceedings of + Multimedia Modeling '93, Singapore, 1993. + + + + + + + +Connolly, Amer & Conrad [Page 34] + +RFC 1693 An Extension to TCP: Partial Order Service November 1994 + + + [HKN91] Hardt-Kornacki, S., and L. Ness, "Optimization Model + for the Delivery of Interactive Multimedia Documents", + In Proc. Globecom '91, 669-673, Phoenix, Arizona, + December 1991. + + [JB88] Jacobson, V., and R. Braden, "TCP Extensions for + Long-Delay Paths", RFC 1072, LBL, USC/Information + Sciences Institute, October 1988. + + [JBB92] Jacobson, V., Braden, R., and D. Borman, "TCP + Extensions for High Performance", RFC 1323, LBL, Cray + Research, USC/Information Sciences Institute, May 1992. + + [LMKQ89] Leffler, S., McKusick, M., Karels, M., and J. + Quarterman, "4.3 BSD UNIX Operating System", + Addison-Wesley Publishing Company, Reading, MA, 1989. + + [OP91] O'Malley, S., and L. Peterson, "TCP Extensions + Considered Harmful", RFC 1263, University of Arizona, + October 1991. + + [Pos81] Postel, J., "Transmission Control Protocol - DARPA + Internet Program Protocol Specification," STD 7, + RFC 793, DARPA, September 1981. + +Security Considerations + + Security issues are not discussed in this memo. 
+ + + + + + + + + + + + + + + + + + + + + + + +Connolly, Amer & Conrad [Page 35] + +RFC 1693 An Extension to TCP: Partial Order Service November 1994 + + +Authors' Addresses + + Tom Connolly + 101C Smith Hall + Department of Computer & Information Sciences + University of Delaware + Newark, DE 19716 - 2586 + + EMail: connolly@udel.edu + + + Paul D. Amer + 101C Smith Hall + Department of Computer & Information Sciences + University of Delaware + Newark, DE 19716 - 2586 + + EMail: amer@udel.edu + + + Phill Conrad + 101C Smith Hall + Department of Computer & Information Sciences + University of Delaware + Newark, DE 19716 - 2586 + + EMail: pconrad@udel.edu + + + + + + + + + + + + + + + + + + + + + + + + +Connolly, Amer & Conrad [Page 36] + diff --git a/ext/picotcp/RFC/rfc2026.txt b/ext/picotcp/RFC/rfc2026.txt new file mode 100644 index 0000000..1c9c59a --- /dev/null +++ b/ext/picotcp/RFC/rfc2026.txt @@ -0,0 +1,2019 @@ + + + + + + +Network Working Group S. Bradner +Request for Comments: 2026 Harvard University +BCP: 9 October 1996 +Obsoletes: 1602 +Category: Best Current Practice + + + The Internet Standards Process -- Revision 3 + + +Status of this Memo + + This document specifies an Internet Best Current Practices for the + Internet Community, and requests discussion and suggestions for + improvements. Distribution of this memo is unlimited. + +Abstract + + This memo documents the process used by the Internet community for + the standardization of protocols and procedures. It defines the + stages in the standardization process, the requirements for moving a + document between stages and the types of documents used during this + process. It also addresses the intellectual property rights and + copyright issues associated with the standards process. + +Table of Contents + + 1. 
INTRODUCTION....................................................2 + 1.1 Internet Standards...........................................3 + 1.2 The Internet Standards Process...............................3 + 1.3 Organization of This Document................................5 + 2. INTERNET STANDARDS-RELATED PUBLICATIONS.........................5 + 2.1 Requests for Comments (RFCs).................................5 + 2.2 Internet-Drafts..............................................7 + 3. INTERNET STANDARD SPECIFICATIONS................................8 + 3.1 Technical Specification (TS).................................8 + 3.2 Applicability Statement (AS).................................8 + 3.3 Requirement Levels...........................................9 + 4. THE INTERNET STANDARDS TRACK...................................10 + 4.1 Standards Track Maturity Levels.............................11 + 4.1.1 Proposed Standard.......................................11 + 4.1.2 Draft Standard..........................................12 + 4.1.3 Internet Standard.......................................13 + 4.2 Non-Standards Track Maturity Levels.........................13 + 4.2.1 Experimental............................................13 + 4.2.2 Informational...........................................14 + 4.2.3 Procedures for Experimental and Informational RFCs......14 + 4.2.4 Historic................................................15 + + + +Bradner Best Current Practice [Page 1] + +RFC 2026 Internet Standards Process October 1996 + + + 5. Best Current Practice (BCP) RFCs...............................15 + 5.1 BCP Review Process..........................................16 + 6. 
THE INTERNET STANDARDS PROCESS.................................17 + 6.1 Standards Actions...........................................17 + 6.1.1 Initiation of Action....................................17 + 6.1.2 IESG Review and Approval................................17 + 6.1.3 Publication.............................................18 + 6.2 Advancing in the Standards Track............................19 + 6.3 Revising a Standard.........................................20 + 6.4 Retiring a Standard.........................................20 + 6.5 Conflict Resolution and Appeals.............................21 + 6.5.1 Working Group Disputes...................................21 + 6.5.2 Process Failures.........................................22 + 6.5.3 Questions of Applicable Procedure........................22 + 6.5.4 Appeals Procedure........................................23 + 7. EXTERNAL STANDARDS AND SPECIFICATIONS..........................23 + 7.1 Use of External Specifications..............................24 + 7.1.1 Incorporation of an Open Standard.......................24 + 7.1.2 Incorporation of a Other Specifications.................24 + 7.1.3 Assumption..............................................25 + 8. NOTICES AND RECORD KEEPING......................................25 + 9. VARYING THE PROCESS.............................................26 + 9.1 The Variance Procedure.......................................26 + 9.2 Exclusions...................................................27 + 10. INTELLECTUAL PROPERTY RIGHTS..................................27 + 10.1. General Policy............................................27 + 10.2 Confidentiality Obligations...............................28 + 10.3. Rights and Permissions....................................28 + 10.3.1. All Contributions......................................28 + 10.3.2. 
Standards Track Documents..............................29 + 10.3.3 Determination of Reasonable and + Non-discriminatory Terms................................30 + 10.4. Notices...................................................30 + 11. ACKNOWLEDGMENTS................................................32 + 12. SECURITY CONSIDERATIONS........................................32 + 13. REFERENCES.....................................................33 + 14. DEFINITIONS OF TERMS...........................................33 + 15. AUTHOR'S ADDRESS...............................................34 + APPENDIX A: GLOSSARY OF ACRONYMS...................................35 + + + + + + + + + + + + +Bradner Best Current Practice [Page 2] + +RFC 2026 Internet Standards Process October 1996 + + +1. INTRODUCTION + + This memo documents the process currently used by the Internet + community for the standardization of protocols and procedures. The + Internet Standards process is an activity of the Internet Society + that is organized and managed on behalf of the Internet community by + the Internet Architecture Board (IAB) and the Internet Engineering + Steering Group (IESG). + +1.1 Internet Standards + + The Internet, a loosely-organized international collaboration of + autonomous, interconnected networks, supports host-to-host + communication through voluntary adherence to open protocols and + procedures defined by Internet Standards. There are also many + isolated interconnected networks, which are not connected to the + global Internet but use the Internet Standards. + + The Internet Standards Process described in this document is + concerned with all protocols, procedures, and conventions that are + used in or by the Internet, whether or not they are part of the + TCP/IP protocol suite. 
In the case of protocols developed and/or + standardized by non-Internet organizations, however, the Internet + Standards Process normally applies to the application of the protocol + or procedure in the Internet context, not to the specification of the + protocol itself. + + In general, an Internet Standard is a specification that is stable + and well-understood, is technically competent, has multiple, + independent, and interoperable implementations with substantial + operational experience, enjoys significant public support, and is + recognizably useful in some or all parts of the Internet. + +1.2 The Internet Standards Process + + In outline, the process of creating an Internet Standard is + straightforward: a specification undergoes a period of development + and several iterations of review by the Internet community and + revision based upon experience, is adopted as a Standard by the + appropriate body (see below), and is published. In practice, the + process is more complicated, due to (1) the difficulty of creating + specifications of high technical quality; (2) the need to consider + the interests of all of the affected parties; (3) the importance of + establishing widespread community consensus; and (4) the difficulty + of evaluating the utility of a particular specification for the + Internet community. + + + + + +Bradner Best Current Practice [Page 3] + +RFC 2026 Internet Standards Process October 1996 + + + The goals of the Internet Standards Process are: + o technical excellence; + o prior implementation and testing; + o clear, concise, and easily understood documentation; + o openness and fairness; and + o timeliness. + + The procedures described in this document are designed to be fair, + open, and objective; to reflect existing (proven) practice; and to + be flexible. + + o These procedures are intended to provide a fair, open, and + objective basis for developing, evaluating, and adopting Internet + Standards. 
They provide ample opportunity for participation and + comment by all interested parties. At each stage of the + standardization process, a specification is repeatedly discussed + and its merits debated in open meetings and/or public electronic + mailing lists, and it is made available for review via world-wide + on-line directories. + + o These procedures are explicitly aimed at recognizing and adopting + generally-accepted practices. Thus, a candidate specification + must be implemented and tested for correct operation and + interoperability by multiple independent parties and utilized in + increasingly demanding environments, before it can be adopted as + an Internet Standard. + + o These procedures provide a great deal of flexibility to adapt to + the wide variety of circumstances that occur in the + standardization process. Experience has shown this flexibility to + be vital in achieving the goals listed above. + + The goal of technical competence, the requirement for prior + implementation and testing, and the need to allow all interested + parties to comment all require significant time and effort. On the + other hand, today's rapid development of networking technology + demands timely development of standards. The Internet Standards + Process is intended to balance these conflicting goals. The process + is believed to be as short and simple as possible without sacrificing + technical excellence, thorough testing before adoption of a standard, + or openness and fairness. + + From its inception, the Internet has been, and is expected to remain, + an evolving system whose participants regularly factor new + requirements and technology into its design and implementation. Users + of the Internet and providers of the equipment, software, and + services that support it should anticipate and embrace this evolution + as a major tenet of Internet philosophy. 
+ + + +Bradner Best Current Practice [Page 4] + +RFC 2026 Internet Standards Process October 1996 + + + The procedures described in this document are the result of a number + of years of evolution, driven both by the needs of the growing and + increasingly diverse Internet community, and by experience. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Bradner Best Current Practice [Page 5] + +RFC 2026 Internet Standards Process October 1996 + + +1.3 Organization of This Document + + Section 2 describes the publications and archives of the Internet + Standards Process. Section 3 describes the types of Internet + standard specifications. Section 4 describes the Internet standards + specifications track. Section 5 describes Best Current Practice + RFCs. Section 6 describes the process and rules for Internet + standardization. Section 7 specifies the way in which externally- + sponsored specifications and practices, developed and controlled by + other standards bodies or by others, are handled within the Internet + Standards Process. Section 8 describes the requirements for notices + and record keeping Section 9 defines a variance process to allow + one-time exceptions to some of the requirements in this document + Section 10 presents the rules that are required to protect + intellectual property rights in the context of the development and + use of Internet Standards. Section 11 includes acknowledgments of + some of the people involved in creation of this document. Section 12 + notes that security issues are not dealt with by this document. + Section 13 contains a list of numbered references. Section 14 + contains definitions of some of the terms used in this document. + Section 15 lists the author's email and postal addresses. Appendix A + contains a list of frequently-used acronyms. + +2. 
INTERNET STANDARDS-RELATED PUBLICATIONS + +2.1 Requests for Comments (RFCs) + + Each distinct version of an Internet standards-related specification + is published as part of the "Request for Comments" (RFC) document + series. This archival series is the official publication channel for + Internet standards documents and other publications of the IESG, IAB, + and Internet community. RFCs can be obtained from a number of + Internet hosts using anonymous FTP, gopher, World Wide Web, and other + Internet document-retrieval systems. + + The RFC series of documents on networking began in 1969 as part of + the original ARPA wide-area networking (ARPANET) project (see + Appendix A for glossary of acronyms). RFCs cover a wide range of + topics in addition to Internet Standards, from early discussion of + new research concepts to status memos about the Internet. RFC + publication is the direct responsibility of the RFC Editor, under the + general direction of the IAB. + + + + + + + + + +Bradner Best Current Practice [Page 6] + +RFC 2026 Internet Standards Process October 1996 + + + The rules for formatting and submitting an RFC are defined in [5]. + Every RFC is available in ASCII text. Some RFCs are also available + in other formats. The other versions of an RFC may contain material + (such as diagrams and figures) that is not present in the ASCII + version, and it may be formatted differently. + + ********************************************************* + * * + * A stricter requirement applies to standards-track * + * specifications: the ASCII text version is the * + * definitive reference, and therefore it must be a * + * complete and accurate specification of the standard, * + * including all necessary diagrams and illustrations. * + * * + ********************************************************* + + The status of Internet protocol and service specifications is + summarized periodically in an RFC entitled "Internet Official + Protocol Standards" [1]. 
This RFC shows the level of maturity and + other helpful information for each Internet protocol or service + specification (see section 3). + + Some RFCs document Internet Standards. These RFCs form the 'STD' + subseries of the RFC series [4]. When a specification has been + adopted as an Internet Standard, it is given the additional label + "STDxxx", but it keeps its RFC number and its place in the RFC + series. (see section 4.1.3) + + Some RFCs standardize the results of community deliberations about + statements of principle or conclusions about what is the best way to + perform some operations or IETF process function. These RFCs form + the 'BCP' subseries of the RFC series. When a specification has been adopted as a BCP, it is given the + additional label "BCPxxx", but it keeps its RFC number and its place + in the RFC series. (see section 5) + + Not all specifications of protocols or services for the Internet + should or will become Internet Standards or BCPs. Such non-standards + track specifications are not subject to the rules for Internet + standardization. Non-standards track specifications may be published + directly as "Experimental" or "Informational" RFCs at the discretion + of the RFC Editor in consultation with the IESG (see section 4.2). + + + + + + + + + + +Bradner Best Current Practice [Page 7] + +RFC 2026 Internet Standards Process October 1996 + + + ******************************************************** + * * + * It is important to remember that not all RFCs * + * are standards track documents, and that not all * + * standards track documents reach the level of * + * Internet Standard. In the same way, not all RFCs * + * which describe current practices have been given * + * the review and approval to become BCPs. See * + * RFC-1796 [6] for further information.
* + * * + ******************************************************** + +2.2 Internet-Drafts + + During the development of a specification, draft versions of the + document are made available for informal review and comment by + placing them in the IETF's "Internet-Drafts" directory, which is + replicated on a number of Internet hosts. This makes an evolving + working document readily available to a wide audience, facilitating + the process of review and revision. + + An Internet-Draft that is published as an RFC, or that has remained + unchanged in the Internet-Drafts directory for more than six months + without being recommended by the IESG for publication as an RFC, is + simply removed from the Internet-Drafts directory. At any time, an + Internet-Draft may be replaced by a more recent version of the same + specification, restarting the six-month timeout period. + + An Internet-Draft is NOT a means of "publishing" a specification; + specifications are published through the RFC mechanism described in + the previous section. Internet-Drafts have no formal status, and are + subject to change or removal at any time. + + ******************************************************** + * * + * Under no circumstances should an Internet-Draft * + * be referenced by any paper, report, or Request- * + * for-Proposal, nor should a vendor claim compliance * + * with an Internet-Draft. * + * * + ******************************************************** + + + + + + + + + + +Bradner Best Current Practice [Page 8] + +RFC 2026 Internet Standards Process October 1996 + + + Note: It is acceptable to reference a standards-track specification + that may reasonably be expected to be published as an RFC using the + phrase "Work in Progress" without referencing an Internet-Draft. 
+ This may also be done in a standards track document itself as long + as the specification in which the reference is made would stand as a + complete and understandable document with or without the reference to + the "Work in Progress". + +3. INTERNET STANDARD SPECIFICATIONS + + Specifications subject to the Internet Standards Process fall into + one of two categories: Technical Specification (TS) and + Applicability Statement (AS). + +3.1 Technical Specification (TS) + + A Technical Specification is any description of a protocol, service, + procedure, convention, or format. It may completely describe all of + the relevant aspects of its subject, or it may leave one or more + parameters or options unspecified. A TS may be completely self- + contained, or it may incorporate material from other specifications + by reference to other documents (which might or might not be Internet + Standards). + + A TS shall include a statement of its scope and the general intent + for its use (domain of applicability). Thus, a TS that is inherently + specific to a particular context shall contain a statement to that + effect. However, a TS does not specify requirements for its use + within the Internet; these requirements, which depend on the + particular context in which the TS is incorporated by different + system configurations, are defined by an Applicability Statement. + +3.2 Applicability Statement (AS) + + An Applicability Statement specifies how, and under what + circumstances, one or more TSs may be applied to support a particular + Internet capability. An AS may specify uses for TSs that are not + Internet Standards, as discussed in Section 7. + + An AS identifies the relevant TSs and the specific way in which they + are to be combined, and may also specify particular values or ranges + of TS parameters or subfunctions of a TS protocol that must be + implemented. 
An AS also specifies the circumstances in which the use + of a particular TS is required, recommended, or elective (see section + 3.3). + + + + + + +Bradner Best Current Practice [Page 9] + +RFC 2026 Internet Standards Process October 1996 + + + An AS may describe particular methods of using a TS in a restricted + "domain of applicability", such as Internet routers, terminal + servers, Internet systems that interface to Ethernets, or datagram- + based database servers. + + The broadest type of AS is a comprehensive conformance specification, + commonly called a "requirements document", for a particular class of + Internet systems, such as Internet routers or Internet hosts. + + An AS may not have a higher maturity level in the standards track + than any standards-track TS on which the AS relies (see section 4.1). + For example, a TS at Draft Standard level may be referenced by an AS + at the Proposed Standard or Draft Standard level, but not by an AS at + the Standard level. + +3.3 Requirement Levels + + An AS shall apply one of the following "requirement levels" to each + of the TSs to which it refers: + + (a) Required: Implementation of the referenced TS, as specified by + the AS, is required to achieve minimal conformance. For example, + IP and ICMP must be implemented by all Internet systems using the + TCP/IP Protocol Suite. + + (b) Recommended: Implementation of the referenced TS is not + required for minimal conformance, but experience and/or generally + accepted technical wisdom suggest its desirability in the domain + of applicability of the AS. Vendors are strongly encouraged to + include the functions, features, and protocols of Recommended TSs + in their products, and should omit them only if the omission is + justified by some special circumstance. For example, the TELNET + protocol should be implemented by all systems that would benefit + from remote access. 
+ + (c) Elective: Implementation of the referenced TS is optional + within the domain of applicability of the AS; that is, the AS + creates no explicit necessity to apply the TS. However, a + particular vendor may decide to implement it, or a particular user + may decide that it is a necessity in a specific environment. For + example, the DECNET MIB could be seen as valuable in an + environment where the DECNET protocol is used. + + + + + + + + + +Bradner Best Current Practice [Page 10] + +RFC 2026 Internet Standards Process October 1996 + + + As noted in section 4.1, there are TSs that are not in the + standards track or that have been retired from the standards + track, and are therefore not required, recommended, or elective. + Two additional "requirement level" designations are available for + these TSs: + + (d) Limited Use: The TS is considered to be appropriate for use + only in limited or unique circumstances. For example, the usage + of a protocol with the "Experimental" designation should generally + be limited to those actively involved with the experiment. + + (e) Not Recommended: A TS that is considered to be inappropriate + for general use is labeled "Not Recommended". This may be because + of its limited functionality, specialized nature, or historic + status. + + Although TSs and ASs are conceptually separate, in practice a + standards-track document may combine an AS and one or more related + TSs. For example, Technical Specifications that are developed + specifically and exclusively for some particular domain of + applicability, e.g., for mail server hosts, often contain within a + single specification all of the relevant AS and TS information. In + such cases, no useful purpose would be served by deliberately + distributing the information among several documents just to preserve + the formal AS/TS distinction. 
However, a TS that is likely to apply + to more than one domain of applicability should be developed in a + modular fashion, to facilitate its incorporation by multiple ASs. + + The "Official Protocol Standards" RFC (STD1) lists a general + requirement level for each TS, using the nomenclature defined in this + section. This RFC is updated periodically. In many cases, more + detailed descriptions of the requirement levels of particular + protocols and of individual features of the protocols will be found + in appropriate ASs. + +4. THE INTERNET STANDARDS TRACK + + Specifications that are intended to become Internet Standards evolve + through a set of maturity levels known as the "standards track". + These maturity levels -- "Proposed Standard", "Draft Standard", and + "Standard" -- are defined and discussed in section 4.1. The way in + which specifications move along the standards track is described in + section 6. + + Even after a specification has been adopted as an Internet Standard, + further evolution often occurs based on experience and the + recognition of new requirements. The nomenclature and procedures of + Internet standardization provide for the replacement of old Internet + + + +Bradner Best Current Practice [Page 11] + +RFC 2026 Internet Standards Process October 1996 + + + Standards with new ones, and the assignment of descriptive labels to + indicate the status of "retired" Internet Standards. A set of + maturity levels is defined in section 4.2 to cover these and other + specifications that are not considered to be on the standards track. + +4.1 Standards Track Maturity Levels + + Internet specifications go through stages of development, testing, + and acceptance. Within the Internet Standards Process, these stages + are formally labeled "maturity levels". + + This section describes the maturity levels and the expected + characteristics of specifications at each level. 
+ +4.1.1 Proposed Standard + + The entry-level maturity for the standards track is "Proposed + Standard". A specific action by the IESG is required to move a + specification onto the standards track at the "Proposed Standard" + level. + + A Proposed Standard specification is generally stable, has resolved + known design choices, is believed to be well-understood, has received + significant community review, and appears to enjoy enough community + interest to be considered valuable. However, further experience + might result in a change or even retraction of the specification + before it advances. + + Usually, neither implementation nor operational experience is + required for the designation of a specification as a Proposed + Standard. However, such experience is highly desirable, and will + usually represent a strong argument in favor of a Proposed Standard + designation. + + The IESG may require implementation and/or operational experience + prior to granting Proposed Standard status to a specification that + materially affects the core Internet protocols or that specifies + behavior that may have significant operational impact on the + Internet. + + A Proposed Standard should have no known technical omissions with + respect to the requirements placed upon it. However, the IESG may + waive this requirement in order to allow a specification to advance + to the Proposed Standard state when it is considered to be useful and + necessary (and timely) even with known technical omissions. + + + + + + +Bradner Best Current Practice [Page 12] + +RFC 2026 Internet Standards Process October 1996 + + + Implementors should treat Proposed Standards as immature + specifications. It is desirable to implement them in order to gain + experience and to validate, test, and clarify the specification. 
+ However, since the content of Proposed Standards may be changed if + problems are found or better solutions are identified, deploying + implementations of such standards into a disruption-sensitive + environment is not recommended. + +4.1.2 Draft Standard + + A specification from which at least two independent and interoperable + implementations from different code bases have been developed, and + for which sufficient successful operational experience has been + obtained, may be elevated to the "Draft Standard" level. For the + purposes of this section, "interoperable" means to be functionally + equivalent or interchangeable components of the system or process in + which they are used. If patented or otherwise controlled technology + is required for implementation, the separate implementations must + also have resulted from separate exercise of the licensing process. + Elevation to Draft Standard is a major advance in status, indicating + a strong belief that the specification is mature and will be useful. + + The requirement for at least two independent and interoperable + implementations applies to all of the options and features of the + specification. In cases in which one or more options or features + have not been demonstrated in at least two interoperable + implementations, the specification may advance to the Draft Standard + level only if those options or features are removed. + + The Working Group chair is responsible for documenting the specific + implementations which qualify the specification for Draft or Internet + Standard status along with documentation about testing of the + interoperation of these implementations. The documentation must + include information about the support of each of the individual + options and features. This documentation should be submitted to the + Area Director with the protocol action request. 
(see Section 6) + + A Draft Standard must be well-understood and known to be quite + stable, both in its semantics and as a basis for developing an + implementation. A Draft Standard may still require additional or + more widespread field experience, since it is possible for + implementations based on Draft Standard specifications to demonstrate + unforeseen behavior when subjected to large-scale use in production + environments. + + + + + + + +Bradner Best Current Practice [Page 13] + +RFC 2026 Internet Standards Process October 1996 + + + A Draft Standard is normally considered to be a final specification, + and changes are likely to be made only to solve specific problems + encountered. In most circumstances, it is reasonable for vendors to + deploy implementations of Draft Standards into a disruption sensitive + environment. + +4.1.3 Internet Standard + + A specification for which significant implementation and successful + operational experience has been obtained may be elevated to the + Internet Standard level. An Internet Standard (which may simply be + referred to as a Standard) is characterized by a high degree of + technical maturity and by a generally held belief that the specified + protocol or service provides significant benefit to the Internet + community. + + A specification that reaches the status of Standard is assigned a + number in the STD series while retaining its RFC number. + +4.2 Non-Standards Track Maturity Levels + + Not every specification is on the standards track. A specification + may not be intended to be an Internet Standard, or it may be intended + for eventual standardization but not yet ready to enter the standards + track. A specification may have been superseded by a more recent + Internet Standard, or have otherwise fallen into disuse or disfavor. + + Specifications that are not on the standards track are labeled with + one of three "off-track" maturity levels: "Experimental", + "Informational", or "Historic". 
The documents bearing these labels + are not Internet Standards in any sense. + +4.2.1 Experimental + + The "Experimental" designation typically denotes a specification that + is part of some research or development effort. Such a specification + is published for the general information of the Internet technical + community and as an archival record of the work, subject only to + editorial considerations and to verification that there has been + adequate coordination with the standards process (see below). An + Experimental specification may be the output of an organized Internet + research effort (e.g., a Research Group of the IRTF), an IETF Working + Group, or it may be an individual contribution. + + + + + + + + +Bradner Best Current Practice [Page 14] + +RFC 2026 Internet Standards Process October 1996 + + +4.2.2 Informational + + An "Informational" specification is published for the general + information of the Internet community, and does not represent an + Internet community consensus or recommendation. The Informational + designation is intended to provide for the timely publication of a + very broad range of responsible informational documents from many + sources, subject only to editorial considerations and to verification + that there has been adequate coordination with the standards process + (see section 4.2.3). + + Specifications that have been prepared outside of the Internet + community and are not incorporated into the Internet Standards + Process by any of the provisions of section 10 may be published as + Informational RFCs, with the permission of the owner and the + concurrence of the RFC Editor. + +4.2.3 Procedures for Experimental and Informational RFCs + + Unless they are the result of IETF Working Group action, documents + intended to be published with Experimental or Informational status + should be submitted directly to the RFC Editor. 
The RFC Editor will + publish any such documents as Internet-Drafts which have not already + been so published. In order to differentiate these Internet-Drafts + they will be labeled or grouped in the I-D directory so they are + easily recognizable. The RFC Editor will wait two weeks after this + publication for comments before proceeding further. The RFC Editor + is expected to exercise his or her judgment concerning the editorial + suitability of a document for publication with Experimental or + Informational status, and may refuse to publish a document which, in + the expert opinion of the RFC Editor, is unrelated to Internet + activity or falls below the technical and/or editorial standard for + RFCs. + + To ensure that the non-standards track Experimental and Informational + designations are not misused to circumvent the Internet Standards + Process, the IESG and the RFC Editor have agreed that the RFC Editor + will refer to the IESG any document submitted for Experimental or + Informational publication which, in the opinion of the RFC Editor, + may be related to work being done, or expected to be done, within the + IETF community. The IESG shall review such a referred document + within a reasonable period of time, and recommend either that it be + published as originally submitted or referred to the IETF as a + contribution to the Internet Standards Process. + + If (a) the IESG recommends that the document be brought within the + IETF and progressed within the IETF context, but the author declines + to do so, or (b) the IESG considers that the document proposes + + + +Bradner Best Current Practice [Page 15] + +RFC 2026 Internet Standards Process October 1996 + + + something that conflicts with, or is actually inimical to, an + established IETF effort, the document may still be published as an + Experimental or Informational RFC. 
In these cases, however, the IESG + may insert appropriate "disclaimer" text into the RFC either in or + immediately following the "Status of this Memo" section in order to + make the circumstances of its publication clear to readers. + + Documents proposed for Experimental and Informational RFCs by IETF + Working Groups go through IESG review. The review is initiated using + the process described in section 6.1.1. + +4.2.4 Historic + + A specification that has been superseded by a more recent + specification or is for any other reason considered to be obsolete is + assigned to the "Historic" level. (Purists have suggested that the + word should be "Historical"; however, at this point the use of + "Historic" is historical.) + + Note: Standards track specifications normally must not depend on + other standards track specifications which are at a lower maturity + level or on non standards track specifications other than referenced + specifications from other standards bodies. (See Section 7.) + +5. BEST CURRENT PRACTICE (BCP) RFCs + + The BCP subseries of the RFC series is designed to be a way to + standardize practices and the results of community deliberations. A + BCP document is subject to the same basic set of procedures as + standards track documents and thus is a vehicle by which the IETF + community can define and ratify the community's best current thinking + on a statement of principle or on what is believed to be the best way + to perform some operations or IETF process function. + + Historically Internet standards have generally been concerned with + the technical specifications for hardware and software required for + computer communication across interconnected networks. However, + since the Internet itself is composed of networks operated by a great + variety of organizations, with diverse goals and rules, good user + service requires that the operators and administrators of the + Internet follow some common guidelines for policies and operations. 
+ While these guidelines are generally different in scope and style + from protocol standards, their establishment needs a similar process + for consensus building. + + While it is recognized that entities such as the IAB and IESG are + composed of individuals who may participate, as individuals, in the + technical work of the IETF, it is also recognized that the entities + + + +Bradner Best Current Practice [Page 16] + +RFC 2026 Internet Standards Process October 1996 + + + themselves have an existence as leaders in the community. As leaders + in the Internet technical community, these entities should have an + outlet to propose ideas to stimulate work in a particular area, to + raise the community's sensitivity to a certain issue, to make a + statement of architectural principle, or to communicate their + thoughts on other matters. The BCP subseries creates a smoothly + structured way for these management entities to insert proposals into + the consensus-building machinery of the IETF while gauging the + community's view of that issue. + + Finally, the BCP series may be used to document the operation of the + IETF itself. For example, this document defines the IETF Standards + Process and is published as a BCP. + +5.1 BCP Review Process + + Unlike standards-track documents, the mechanisms described in BCPs + are not well suited to the phased roll-in nature of the three stage + standards track and instead generally only make sense for full and + immediate instantiation. + + The BCP process is similar to that for proposed standards. The BCP + is submitted to the IESG for review, (see section 6.1.1) and the + existing review process applies, including a Last-Call on the IETF + Announce mailing list. However, once the IESG has approved the + document, the process ends and the document is published. The + resulting document is viewed as having the technical approval of the + IETF. 
+ + Specifically, a document to be considered for the status of BCP must + undergo the procedures outlined in sections 6.1, and 6.4 of this + document. The BCP process may be appealed according to the procedures + in section 6.5. + + Because BCPs are meant to express community consensus but are arrived + at more quickly than standards, BCPs require particular care. + Specifically, BCPs should not be viewed simply as stronger + Informational RFCs, but rather should be viewed as documents suitable + for a content different from Informational RFCs. + + A specification, or group of specifications, that has, or have been + approved as a BCP is assigned a number in the BCP series while + retaining its RFC number(s). + + + + + + + + +Bradner Best Current Practice [Page 17] + +RFC 2026 Internet Standards Process October 1996 + + +6. THE INTERNET STANDARDS PROCESS + + The mechanics of the Internet Standards Process involve decisions of + the IESG concerning the elevation of a specification onto the + standards track or the movement of a standards-track specification + from one maturity level to another. Although a number of reasonably + objective criteria (described below and in section 4) are available + to guide the IESG in making a decision to move a specification onto, + along, or off the standards track, there is no algorithmic guarantee + of elevation to or progression along the standards track for any + specification. The experienced collective judgment of the IESG + concerning the technical quality of a specification proposed for + elevation to or advancement in the standards track is an essential + component of the decision-making process. + +6.1 Standards Actions + + A "standards action" -- entering a particular specification into, + advancing it within, or removing it from, the standards track -- must + be approved by the IESG. 
+ +6.1.1 Initiation of Action + + A specification that is intended to enter or advance in the Internet + standards track shall first be posted as an Internet-Draft (see + section 2.2) unless it has not changed since publication as an RFC. + It shall remain as an Internet-Draft for a period of time, not less + than two weeks, that permits useful community review, after which a + recommendation for action may be initiated. + + A standards action is initiated by a recommendation by the IETF + Working group responsible for a specification to its Area Director, + copied to the IETF Secretariat or, in the case of a specification not + associated with a Working Group, a recommendation by an individual to + the IESG. + +6.1.2 IESG Review and Approval + + The IESG shall determine whether or not a specification submitted to + it according to section 6.1.1 satisfies the applicable criteria for + the recommended action (see sections 4.1 and 4.2), and shall in + addition determine whether or not the technical quality and clarity + of the specification is consistent with that expected for the + maturity level to which the specification is recommended. + + In order to obtain all of the information necessary to make these + determinations, particularly when the specification is considered by + the IESG to be extremely important in terms of its potential impact + + + +Bradner Best Current Practice [Page 18] + +RFC 2026 Internet Standards Process October 1996 + + + on the Internet or on the suite of Internet protocols, the IESG may, + at its discretion, commission an independent technical review of the + specification. + + The IESG will send notice to the IETF of the pending IESG + consideration of the document(s) to permit a final review by the + general Internet community. This "Last-Call" notification shall be + via electronic mail to the IETF Announce mailing list. 
Comments on a + Last-Call shall be accepted from anyone, and should be sent as + directed in the Last-Call announcement. + + The Last-Call period shall be no shorter than two weeks except in + those cases where the proposed standards action was not initiated by + an IETF Working Group, in which case the Last-Call period shall be no + shorter than four weeks. If the IESG believes that the community + interest would be served by allowing more time for comment, it may + decide on a longer Last-Call period or to explicitly lengthen a + current Last-Call period. + + The IESG is not bound by the action recommended when the + specification was submitted. For example, the IESG may decide to + consider the specification for publication in a different category + than that requested. If the IESG determines this before the Last- + Call is issued then the Last-Call should reflect the IESG's view. + The IESG could also decide to change the publication category based + on the response to a Last-Call. If this decision would result in a + specification being published at a "higher" level than the original + Last-Call was for, a new Last-Call should be issued indicating the + IESG recommendation. In addition, the IESG may decide to recommend + the formation of a new Working Group in the case of significant + controversy in response to a Last-Call for specification not + originating from an IETF Working Group. + + In a timely fashion after the expiration of the Last-Call period, the + IESG shall make its final determination of whether or not to approve + the standards action, and shall notify the IETF of its decision via + electronic mail to the IETF Announce mailing list. + +6.1.3 Publication + + If a standards action is approved, notification is sent to the RFC + Editor and copied to the IETF with instructions to publish the + specification as an RFC. The specification shall at that point be + removed from the Internet-Drafts directory. 
+ + + + + + + +Bradner Best Current Practice [Page 19] + +RFC 2026 Internet Standards Process October 1996 + + + An official summary of standards actions completed and pending shall + appear in each issue of the Internet Society's newsletter. This + shall constitute the "publication of record" for Internet standards + actions. + + The RFC Editor shall publish periodically an "Internet Official + Protocol Standards" RFC [1], summarizing the status of all Internet + protocol and service specifications. + +6.2 Advancing in the Standards Track + + The procedure described in section 6.1 is followed for each action + that attends the advancement of a specification along the standards + track. + + A specification shall remain at the Proposed Standard level for at + least six (6) months. + + A specification shall remain at the Draft Standard level for at least + four (4) months, or until at least one IETF meeting has occurred, + whichever comes later. + + These minimum periods are intended to ensure adequate opportunity for + community review without severely impacting timeliness. These + intervals shall be measured from the date of publication of the + corresponding RFC(s), or, if the action does not result in RFC + publication, the date of the announcement of the IESG approval of the + action. + + A specification may be (indeed, is likely to be) revised as it + advances through the standards track. At each stage, the IESG shall + determine the scope and significance of the revision to the + specification, and, if necessary and appropriate, modify the + recommended action. Minor revisions are expected, but a significant + revision may require that the specification accumulate more + experience at its current maturity level before progressing. Finally, + if the specification has been changed very significantly, the IESG + may recommend that the revision be treated as a new document, re- + entering the standards track at the beginning. 
+ + Change of status shall result in republication of the specification + as an RFC, except in the rare case that there have been no changes at + all in the specification since the last publication. Generally, + desired changes will be "batched" for incorporation at the next level + in the standards track. However, deferral of changes to the next + standards action on the specification will not always be possible or + desirable; for example, an important typographical error, or a + technical error that does not represent a change in overall function + + + +Bradner Best Current Practice [Page 20] + +RFC 2026 Internet Standards Process October 1996 + + + of the specification, may need to be corrected immediately. In such + cases, the IESG or RFC Editor may be asked to republish the RFC (with + a new number) with corrections, and this will not reset the minimum + time-at-level clock. + + When a standards-track specification has not reached the Internet + Standard level but has remained at the same maturity level for + twenty-four (24) months, and every twelve (12) months thereafter + until the status is changed, the IESG shall review the viability of + the standardization effort responsible for that specification and the + usefulness of the technology. Following each such review, the IESG + shall approve termination or continuation of the development effort, + at the same time the IESG shall decide to maintain the specification + at the same maturity level or to move it to Historic status. This + decision shall be communicated to the IETF by electronic mail to the + IETF Announce mailing list to allow the Internet community an + opportunity to comment. This provision is not intended to threaten a + legitimate and active Working Group effort, but rather to provide an + administrative mechanism for terminating a moribund effort. 
+ +6.3 Revising a Standard + + A new version of an established Internet Standard must progress + through the full Internet standardization process as if it were a + completely new specification. Once the new version has reached the + Standard level, it will usually replace the previous version, which + will be moved to Historic status. However, in some cases both + versions may remain as Internet Standards to honor the requirements + of an installed base. In this situation, the relationship between + the previous and the new versions must be explicitly stated in the + text of the new version or in another appropriate document (e.g., an + Applicability Statement; see section 3.2). + +6.4 Retiring a Standard + + As the technology changes and matures, it is possible for a new + Standard specification to be so clearly superior technically that one + or more existing standards track specifications for the same function + should be retired. In this case, or when it is felt for some other + reason that an existing standards track specification should be + retired, the IESG shall approve a change of status of the old + specification(s) to Historic. This recommendation shall be issued + with the same Last-Call and notification procedures used for any + other standards action. A request to retire an existing standard can + originate from a Working Group, an Area Director or some other + interested party. + + + + + +Bradner Best Current Practice [Page 21] + +RFC 2026 Internet Standards Process October 1996 + + +6.5 Conflict Resolution and Appeals + + Disputes are possible at various stages during the IETF process. As + much as possible the process is designed so that compromises can be + made, and genuine consensus achieved, however there are times when + even the most reasonable and knowledgeable people are unable to + agree. To achieve the goals of openness and fairness, such conflicts + must be resolved by a process of open review and discussion. 
This + section specifies the procedures that shall be followed to deal with + Internet standards issues that cannot be resolved through the normal + processes whereby IETF Working Groups and other Internet Standards + Process participants ordinarily reach consensus. + +6.5.1 Working Group Disputes + + An individual (whether a participant in the relevant Working Group or + not) may disagree with a Working Group recommendation based on his or + her belief that either (a) his or her own views have not been + adequately considered by the Working Group, or (b) the Working Group + has made an incorrect technical choice which places the quality + and/or integrity of the Working Group's product(s) in significant + jeopardy. The first issue is a difficulty with Working Group + process; the latter is an assertion of technical error. These two + types of disagreement are quite different, but both are handled by + the same process of review. + + A person who disagrees with a Working Group recommendation shall + always first discuss the matter with the Working Group's chair(s), + who may involve other members of the Working Group (or the Working + Group as a whole) in the discussion. + + If the disagreement cannot be resolved in this way, any of the + parties involved may bring it to the attention of the Area + Director(s) for the area in which the Working Group is chartered. + The Area Director(s) shall attempt to resolve the dispute. + + If the disagreement cannot be resolved by the Area Director(s) any of + the parties involved may then appeal to the IESG as a whole. The + IESG shall then review the situation and attempt to resolve it in a + manner of its own choosing. + + If the disagreement is not resolved to the satisfaction of the + parties at the IESG level, any of the parties involved may appeal the + decision to the IAB. The IAB shall then review the situation and + attempt to resolve it in a manner of its own choosing. 
+ + + + + + +Bradner Best Current Practice [Page 22] + +RFC 2026 Internet Standards Process October 1996 + + + The IAB decision is final with respect to the question of whether or + not the Internet standards procedures have been followed and with + respect to all questions of technical merit. + +6.5.2 Process Failures + + This document sets forward procedures required to be followed to + ensure openness and fairness of the Internet Standards Process, and + the technical viability of the standards created. The IESG is the + principal agent of the IETF for this purpose, and it is the IESG that + is charged with ensuring that the required procedures have been + followed, and that any necessary prerequisites to a standards action + have been met. + + If an individual should disagree with an action taken by the IESG in + this process, that person should first discuss the issue with the + IESG Chair. If the IESG Chair is unable to satisfy the complainant + then the IESG as a whole should re-examine the action taken, along + with input from the complainant, and determine whether any further + action is needed. The IESG shall issue a report on its review of the + complaint to the IETF. + + Should the complainant not be satisfied with the outcome of the IESG + review, an appeal may be lodged to the IAB. The IAB shall then review + the situation and attempt to resolve it in a manner of its own + choosing and report to the IETF on the outcome of its review. + + If circumstances warrant, the IAB may direct that an IESG decision be + annulled, and the situation shall then be as it was before the IESG + decision was taken. The IAB may also recommend an action to the IESG, + or make such other recommendations as it deems fit. The IAB may not, + however, pre-empt the role of the IESG by issuing a decision which + only the IESG is empowered to make. + + The IAB decision is final with respect to the question of whether or + not the Internet standards procedures have been followed.
+ +6.5.3 Questions of Applicable Procedure + + Further recourse is available only in cases in which the procedures + themselves (i.e., the procedures described in this document) are + claimed to be inadequate or insufficient to the protection of the + rights of all parties in a fair and open Internet Standards Process. + Claims on this basis may be made to the Internet Society Board of + Trustees. The President of the Internet Society shall acknowledge + such an appeal within two weeks, and shall at the time of + acknowledgment advise the petitioner of the expected duration of the + Trustees' review of the appeal. The Trustees shall review the + + + +Bradner Best Current Practice [Page 23] + +RFC 2026 Internet Standards Process October 1996 + + + situation in a manner of its own choosing and report to the IETF on + the outcome of its review. + + The Trustees' decision upon completion of their review shall be final + with respect to all aspects of the dispute. + +6.5.4 Appeals Procedure + + All appeals must include a detailed and specific description of the + facts of the dispute. + + All appeals must be initiated within two months of the public + knowledge of the action or decision to be challenged. + + At all stages of the appeals process, the individuals or bodies + responsible for making the decisions have the discretion to define + the specific procedures they will follow in the process of making + their decision. + + In all cases a decision concerning the disposition of the dispute, + and the communication of that decision to the parties involved, must + be accomplished within a reasonable period of time. + + [NOTE: These procedures intentionally and explicitly do not + establish a fixed maximum time period that shall be considered + "reasonable" in all cases. 
The Internet Standards Process places a + premium on consensus and efforts to achieve it, and deliberately + foregoes deterministically swift execution of procedures in favor of + a latitude within which more genuine technical agreements may be + reached.] + +7. EXTERNAL STANDARDS AND SPECIFICATIONS + + Many standards groups other than the IETF create and publish + standards documents for network protocols and services. When these + external specifications play an important role in the Internet, it is + desirable to reach common agreements on their usage -- i.e., to + establish Internet Standards relating to these external + specifications. + + There are two categories of external specifications: + + (1) Open Standards + + Various national and international standards bodies, such as ANSI, + ISO, IEEE, and ITU-T, develop a variety of protocol and service + specifications that are similar to Technical Specifications + defined here. National and international groups also publish + + + +Bradner Best Current Practice [Page 24] + +RFC 2026 Internet Standards Process October 1996 + + + "implementors' agreements" that are analogous to Applicability + Statements, capturing a body of implementation-specific detail + concerned with the practical application of their standards. All + of these are considered to be "open external standards" for the + purposes of the Internet Standards Process. + + (2) Other Specifications + + Other proprietary specifications that have come to be widely used + in the Internet may be treated by the Internet community as if + they were a "standards". Such a specification is not generally + developed in an open fashion, is typically proprietary, and is + controlled by the vendor, vendors, or organization that produced + it. 
+ +7.1 Use of External Specifications + + To avoid conflict between competing versions of a specification, the + Internet community will not standardize a specification that is + simply an "Internet version" of an existing external specification + unless an explicit cooperative arrangement to do so has been made. + However, there are several ways in which an external specification + that is important for the operation and/or evolution of the Internet + may be adopted for Internet use. + +7.1.1 Incorporation of an Open Standard + + An Internet Standard TS or AS may incorporate an open external + standard by reference. For example, many Internet Standards + incorporate by reference the ANSI standard character set "ASCII" [2]. + Whenever possible, the referenced specification shall be available + online. + +7.1.2 Incorporation of Other Specifications + + Other proprietary specifications may be incorporated by reference to + a version of the specification as long as the proprietor meets the + requirements of section 10. If the other proprietary specification + is not widely and readily available, the IESG may request that it be + published as an Informational RFC. + + The IESG generally should not favor a particular proprietary + specification over technically equivalent and competing + specification(s) by making any incorporated vendor specification + "required" or "recommended". + + + + + + +Bradner Best Current Practice [Page 25] + +RFC 2026 Internet Standards Process October 1996 + + +7.1.3 Assumption + + An IETF Working Group may start from an external specification and + develop it into an Internet specification. This is acceptable if (1) + the specification is provided to the Working Group in compliance with + the requirements of section 10, and (2) change control has been + conveyed to IETF by the original developer of the specification for + the specification or for specifications derived from the original + specification. + +8. 
NOTICES AND RECORD KEEPING + + Each of the organizations involved in the development and approval of + Internet Standards shall publicly announce, and shall maintain a + publicly accessible record of, every activity in which it engages, to + the extent that the activity represents the prosecution of any part + of the Internet Standards Process. For purposes of this section, the + organizations involved in the development and approval of Internet + Standards includes the IETF, the IESG, the IAB, all IETF Working + Groups, and the Internet Society Board of Trustees. + + For IETF and Working Group meetings announcements shall be made by + electronic mail to the IETF Announce mailing list and shall be made + sufficiently far in advance of the activity to permit all interested + parties to effectively participate. The announcement shall contain + (or provide pointers to) all of the information that is necessary to + support the participation of any interested individual. In the case + of a meeting, for example, the announcement shall include an agenda + that specifies the standards- related issues that will be discussed. + + The formal record of an organization's standards-related activity + shall include at least the following: + + o the charter of the organization (or a defining document equivalent + to a charter); + o complete and accurate minutes of meetings; + o the archives of Working Group electronic mail mailing lists; and + o all written contributions from participants that pertain to the + organization's standards-related activity. + + As a practical matter, the formal record of all Internet Standards + Process activities is maintained by the IETF Secretariat, and is the + responsibility of the IETF Secretariat except that each IETF Working + Group is expected to maintain their own email list archive and must + make a best effort to ensure that all traffic is captured and + included in the archives. 
Also, the Working Group chair is + responsible for providing the IETF Secretariat with complete and + accurate minutes of all Working Group meetings. Internet-Drafts that + + + +Bradner Best Current Practice [Page 26] + +RFC 2026 Internet Standards Process October 1996 + + + have been removed (for any reason) from the Internet-Drafts + directories shall be archived by the IETF Secretariat for the sole + purpose of preserving an historical record of Internet standards + activity and thus are not retrievable except in special + circumstances. + +9. VARYING THE PROCESS + + This document, which sets out the rules and procedures by which + Internet Standards and related documents are made is itself a product + of the Internet Standards Process (as a BCP, as described in section + 5). It replaces a previous version, and in time, is likely itself to + be replaced. + + While, when published, this document represents the community's view + of the proper and correct process to follow, and requirements to be + met, to allow for the best possible Internet Standards and BCPs, it + cannot be assumed that this will always remain the case. From time to + time there may be a desire to update it, by replacing it with a new + version. Updating this document uses the same open procedures as are + used for any other BCP. + + In addition, there may be situations where following the procedures + leads to a deadlock about a specific specification, or there may be + situations where the procedures provide no guidance. In these cases + it may be appropriate to invoke the variance procedure described + below. + +9.1 The Variance Procedure + + Upon the recommendation of the responsible IETF Working Group (or, if + no Working Group is constituted, upon the recommendation of an ad hoc + committee), the IESG may enter a particular specification into, or + advance it within, the standards track even though some of the + requirements of this document have not or will not be met. 
The IESG + may approve such a variance, however, only if it first determines + that the likely benefits to the Internet community are likely to + outweigh any costs to the Internet community that result from + noncompliance with the requirements in this document. In exercising + this discretion, the IESG shall at least consider (a) the technical + merit of the specification, (b) the possibility of achieving the + goals of the Internet Standards Process without granting a variance, + (c) alternatives to the granting of a variance, (d) the collateral + and precedential effects of granting a variance, and (e) the IESG's + ability to craft a variance that is as narrow as possible. In + determining whether to approve a variance, the IESG has discretion to + limit the scope of the variance to particular parts of this document + and to impose such additional restrictions or limitations as it + + + +Bradner Best Current Practice [Page 27] + +RFC 2026 Internet Standards Process October 1996 + + + determines appropriate to protect the interests of the Internet + community. + + The proposed variance must detail the problem perceived, explain the + precise provision of this document which is causing the need for a + variance, and the results of the IESG's considerations including + consideration of points (a) through (d) in the previous paragraph. + The proposed variance shall be issued as an Internet Draft. The IESG + shall then issue an extended Last-Call, of no less than 4 weeks, to + allow for community comment upon the proposal. + + In a timely fashion after the expiration of the Last-Call period, the + IESG shall make its final determination of whether or not to approve + the proposed variance, and shall notify the IETF of its decision via + electronic mail to the IETF Announce mailing list. If the variance + is approved it shall be forwarded to the RFC Editor with a request + that it be published as a BCP. 
+ + This variance procedure is for use when a one-time waiving of some + provision of this document is felt to be required. Permanent changes + to this document shall be accomplished through the normal BCP + process. + + The appeals process in section 6.5 applies to this process. + +9.2 Exclusions + + No use of this procedure may lower any specified delays, nor exempt + any proposal from the requirements of openness, fairness, or + consensus, nor from the need to keep proper records of the meetings + and mailing list discussions. + + Specifically, the following sections of this document must not be + subject of a variance: 5.1, 6.1, 6.1.1 (first paragraph), 6.1.2, 6.3 + (first sentence), 6.5 and 9. + +10. INTELLECTUAL PROPERTY RIGHTS + +10.1. General Policy + + In all matters of intellectual property rights and procedures, the + intention is to benefit the Internet community and the public at + large, while respecting the legitimate rights of others. + + + + + + + + +Bradner Best Current Practice [Page 28] + +RFC 2026 Internet Standards Process October 1996 + + +10.2 Confidentiality Obligations + + No contribution that is subject to any requirement of confidentiality + or any restriction on its dissemination may be considered in any part + of the Internet Standards Process, and there must be no assumption of + any confidentiality obligation with respect to any such contribution. + +10.3. Rights and Permissions + + In the course of standards work, the IETF receives contributions in + various forms and from many persons. To best facilitate the + dissemination of these contributions, it is necessary to understand + any intellectual property rights (IPR) relating to the contributions. + +10.3.1.
All Contributions + + By submission of a contribution, each person actually submitting the + contribution is deemed to agree to the following terms and conditions + on his own behalf, on behalf of the organization (if any) he + represents and on behalf of the owners of any proprietary rights in the + contribution. Where a submission identifies contributors in + addition to the contributor(s) who provide the actual submission, the + actual submitter(s) represent that each other named contributor was + made aware of and agreed to accept the same terms and conditions on + his own behalf, on behalf of any organization he may represent and + any known owner of any proprietary rights in the contribution. + + 1. Some works (e.g. works of the U.S. Government) are not subject to + copyright. However, to the extent that the submission is or may + be subject to copyright, the contributor, the organization he + represents (if any) and the owners of any proprietary rights in + the contribution, grant an unlimited perpetual, non-exclusive, + royalty-free, world-wide right and license to the ISOC and the + IETF under any copyrights in the contribution. This license + includes the right to copy, publish and distribute the + contribution in any way, and to prepare derivative works that are + based on or incorporate all or part of the contribution, the + license to such derivative works to be of the same scope as the + license of the original contribution. + + 2. The contributor acknowledges that the ISOC and IETF have no duty + to publish or otherwise use or disseminate any contribution. + + 3. The contributor grants permission to reference the name(s) and + address(es) of the contributor(s) and of the organization(s) he + represents (if any). + + + + + +Bradner Best Current Practice [Page 29] + +RFC 2026 Internet Standards Process October 1996 + + + 4. The contributor represents that the contribution properly acknowledges + major contributors. + + 5.
The contributor, the organization (if any) he represents and the + owners of any proprietary rights in the contribution, agree that + no information in the contribution is confidential and that the + ISOC and its affiliated organizations may freely disclose any + information in the contribution. + + 6. The contributor represents that he has disclosed the existence of + any proprietary or intellectual property rights in the + contribution that are reasonably and personally known to the + contributor. The contributor does not represent that he + personally knows of all potentially pertinent proprietary and + intellectual property rights owned or claimed by the organization + he represents (if any) or third parties. + + 7. The contributor represents that there are no limits to the + contributor's ability to make the grants acknowledgments and + agreements above that are reasonably and personally known to the + contributor. + + By ratifying this description of the IETF process the Internet + Society warrants that it will not inhibit the traditional open and + free access to IETF documents for which license and right have + been assigned according to the procedures set forth in this + section, including Internet-Drafts and RFCs. This warrant is + perpetual and will not be revoked by the Internet Society or its + successors or assigns. + +10.3.2. Standards Track Documents + + (A) Where any patents, patent applications, or other proprietary + rights are known, or claimed, with respect to any specification on + the standards track, and brought to the attention of the IESG, the + IESG shall not advance the specification without including in the + document a note indicating the existence of such rights, or + claimed rights.
Where implementations are required before + advancement of a specification, only implementations that have, by + statement of the implementors, taken adequate steps to comply with + any such rights, or claimed rights, shall be considered for the + purpose of showing the adequacy of the specification. + (B) The IESG disclaims any responsibility for identifying the + existence of or for evaluating the applicability of any claimed + copyrights, patents, patent applications, or other rights in the + fulfilling of the its obligations under (A), and will take no + position on the validity or scope of any such rights. + + + + +Bradner Best Current Practice [Page 30] + +RFC 2026 Internet Standards Process October 1996 + + + (C) Where the IESG knows of rights, or claimed rights under (A), the + IETF Executive Director shall attempt to obtain from the claimant + of such rights, a written assurance that upon approval by the IESG + of the relevant Internet standards track specification(s), any + party will be able to obtain the right to implement, use and + distribute the technology or works when implementing, using or + distributing technology based upon the specific specification(s) + under openly specified, reasonable, non-discriminatory terms. + The Working Group proposing the use of the technology with respect + to which the proprietary rights are claimed may assist the IETF + Executive Director in this effort. The results of this procedure + shall not affect advancement of a specification along the + standards track, except that the IESG may defer approval where a + delay may facilitate the obtaining of such assurances. The + results will, however, be recorded by the IETF Executive Director, + and made available. The IESG may also direct that a summary of + the results be included in any RFC published containing the + specification. 
+ +10.3.3 Determination of Reasonable and Non-discriminatory Terms + + The IESG will not make any explicit determination that the assurance + of reasonable and non-discriminatory terms for the use of a + technology has been fulfilled in practice. It will instead use the + normal requirements for the advancement of Internet Standards to + verify that the terms for use are reasonable. If the two unrelated + implementations of the specification that are required to advance + from Proposed Standard to Draft Standard have been produced by + different organizations or individuals or if the "significant + implementation and successful operational experience" required to + advance from Draft Standard to Standard has been achieved the + assumption is that the terms must be reasonable and to some degree, + non-discriminatory. This assumption may be challenged during the + Last-Call period. + +10.4. Notices + + (A) Standards track documents shall include the following notice: + + "The IETF takes no position regarding the validity or scope of + any intellectual property or other rights that might be claimed + to pertain to the implementation or use of the technology + described in this document or the extent to which any license + under such rights might or might not be available; neither does + it represent that it has made any effort to identify any such + rights. Information on the IETF's procedures with respect to + rights in standards-track and standards-related documentation + can be found in BCP-11. Copies of claims of rights made + + + +Bradner Best Current Practice [Page 31] + +RFC 2026 Internet Standards Process October 1996 + + + available for publication and any assurances of licenses to + be made available, or the result of an attempt made + to obtain a general license or permission for the use of such + proprietary rights by implementors or users of this + specification can be obtained from the IETF Secretariat." 
+ + (B) The IETF encourages all interested parties to bring to its + attention, at the earliest possible time, the existence of any + intellectual property rights pertaining to Internet Standards. + For this purpose, each standards document shall include the + following invitation: + + "The IETF invites any interested party to bring to its + attention any copyrights, patents or patent applications, or + other proprietary rights which may cover technology that may be + required to practice this standard. Please address the + information to the IETF Executive Director." + + (C) The following copyright notice and disclaimer shall be included + in all ISOC standards-related documentation: + + "Copyright (C) The Internet Society (date). All Rights + Reserved. + + This document and translations of it may be copied and + furnished to others, and derivative works that comment on or + otherwise explain it or assist in its implementation may be + prepared, copied, published and distributed, in whole or in + part, without restriction of any kind, provided that the above + copyright notice and this paragraph are included on all such + copies and derivative works. However, this document itself may + not be modified in any way, such as by removing the copyright + notice or references to the Internet Society or other Internet + organizations, except as needed for the purpose of developing + Internet standards in which case the procedures for copyrights + defined in the Internet Standards process must be followed, or + as required to translate it into languages other than English. + + The limited permissions granted above are perpetual and will + not be revoked by the Internet Society or its successors or + assigns.
+ + + + + + + + + + +Bradner Best Current Practice [Page 32] + +RFC 2026 Internet Standards Process October 1996 + + + This document and the information contained herein is provided + on an "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET + ENGINEERING TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTY THAT THE USE + OF THE INFORMATION HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY + IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A + PARTICULAR PURPOSE." + + (D) Where the IESG is aware at the time of publication of + proprietary rights claimed with respect to a standards track + document, or the technology described or referenced therein, such + document shall contain the following notice: + + "The IETF has been notified of intellectual property rights + claimed in regard to some or all of the specification contained + in this document. For more information consult the online list + of claimed rights." + +11. ACKNOWLEDGMENTS + + There have been a number of people involved with the development of + the documents defining the IETF Standards Process over the years. + The process was first described in RFC 1310 then revised in RFC 1602 + before the current effort (which relies heavily on its predecessors). + Specific acknowledgments must be extended to Lyman Chapin, Phill + Gross and Christian Huitema as the editors of the previous versions, + to Jon Postel and Dave Crocker for their inputs to those versions, to + Andy Ireland, Geoff Stewart, Jim Lampert, and Dick Holleman for their + reviews of the legal aspects of the procedures described herein, and + to John Stewart, Robert Elz and Steve Coya for their extensive input + on the final version. + + In addition much of the credit for the refinement of the details of + the IETF processes belongs to the many members of the various + incarnations of the POISED Working Group. + +12. SECURITY CONSIDERATIONS + + Security issues are not discussed in this memo. 
+ + + + + + + + + + + + +Bradner Best Current Practice [Page 33] + +RFC 2026 Internet Standards Process October 1996 + + +13. REFERENCES + + [1] Postel, J., "Internet Official Protocol Standards", STD 1, + USC/Information Sciences Institute, March 1996. + + [2] ANSI, Coded Character Set -- 7-Bit American Standard Code for + Information Interchange, ANSI X3.4-1986. + + [3] Reynolds, J., and J. Postel, "Assigned Numbers", STD 2, + USC/Information Sciences Institute, October 1994. + + [4] Postel, J., "Introduction to the STD Notes", RFC 1311, + USC/Information Sciences Institute, March 1992. + + [5] Postel, J., "Instructions to RFC Authors", RFC 1543, + USC/Information Sciences Institute, October 1993. + + [6] Huitema, C., J. Postel, and S. Crocker "Not All RFCs are + Standards", RFC 1796, April 1995. + +14. DEFINITIONS OF TERMS + + IETF Area - A management division within the IETF. An Area consists + of Working Groups related to a general topic such as routing. An + Area is managed by one or two Area Directors. + Area Director - The manager of an IETF Area. The Area Directors + along with the IETF Chair comprise the Internet Engineering + Steering Group (IESG). + File Transfer Protocol (FTP) - An Internet application used to + transfer files in a TCP/IP network. + gopher - An Internet application used to interactively select and + retrieve files in a TCP/IP network. + Internet Architecture Board (IAB) - An appointed group that assists + in the management of the IETF standards process. + Internet Engineering Steering Group (IESG) - A group comprised of the + IETF Area Directors and the IETF Chair. The IESG is responsible + for the management, along with the IAB, of the IETF and is the + standards approval board for the IETF. + interoperable - For the purposes of this document, "interoperable" + means to be able to interoperate over a data communications path. 
+ Last-Call - A public comment period used to gage the level of + consensus about the reasonableness of a proposed standards action. + (see section 6.1.2) + + + + + + + + +Bradner Best Current Practice [Page 34] + +RFC 2026 Internet Standards Process October 1996 + + + online - Relating to information made available over the Internet. + When referenced in this document material is said to be online + when it is retrievable without restriction or undue fee using + standard Internet applications such as anonymous FTP, gopher or + the WWW. + Working Group - A group chartered by the IESG and IAB to work on a + specific specification, set of specifications or topic. + +15. AUTHOR'S ADDRESS + + Scott O. Bradner + Harvard University + Holyoke Center, Room 813 + 1350 Mass. Ave. + Cambridge, MA 02138 + USA + + Phone: +1 617 495 3864 + EMail: sob@harvard.edu + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Bradner Best Current Practice [Page 35] + +RFC 2026 Internet Standards Process October 1996 + + +APPENDIX A: GLOSSARY OF ACRONYMS + + ANSI: American National Standards Institute + ARPA: (U.S.) Advanced Research Projects Agency + AS: Applicability Statement + FTP: File Transfer Protocol + ASCII: American Standard Code for Information Interchange + ITU-T: Telecommunications Standardization sector of the + International Telecommunication Union (ITU), a UN + treaty organization; ITU-T was formerly called CCITT. 
+ IAB: Internet Architecture Board + IANA: Internet Assigned Numbers Authority + IEEE: Institute of Electrical and Electronics Engineers + ICMP: Internet Control Message Protocol + IESG: Internet Engineering Steering Group + IETF: Internet Engineering Task Force + IP: Internet Protocol + IRSG Internet Research Steering Group + IRTF: Internet Research Task Force + ISO: International Organization for Standardization + ISOC: Internet Society + MIB: Management Information Base + OSI: Open Systems Interconnection + RFC: Request for Comments + TCP: Transmission Control Protocol + TS: Technical Specification + WWW: World Wide Web + + + + + + + + + + + + + + + + + + + + + + + + +Bradner Best Current Practice [Page 36] + diff --git a/ext/picotcp/RFC/rfc2131.txt b/ext/picotcp/RFC/rfc2131.txt new file mode 100644 index 0000000..f45d9b8 --- /dev/null +++ b/ext/picotcp/RFC/rfc2131.txt @@ -0,0 +1,2523 @@ + + + + + + +Network Working Group R. Droms +Request for Comments: 2131 Bucknell University +Obsoletes: 1541 March 1997 +Category: Standards Track + + Dynamic Host Configuration Protocol + +Status of this memo + + This document specifies an Internet standards track protocol for the + Internet community, and requests discussion and suggestions for + improvements. Please refer to the current edition of the "Internet + Official Protocol Standards" (STD 1) for the standardization state + and status of this protocol. Distribution of this memo is unlimited. + +Abstract + + The Dynamic Host Configuration Protocol (DHCP) provides a framework + for passing configuration information to hosts on a TCPIP network. + DHCP is based on the Bootstrap Protocol (BOOTP) [7], adding the + capability of automatic allocation of reusable network addresses and + additional configuration options [19]. DHCP captures the behavior of + BOOTP relay agents [7, 21], and DHCP participants can interoperate + with BOOTP participants [9]. + +Table of Contents + + 1. Introduction. . . . . . . . . . . . . . . . . . 
. . . . . . . 2 + 1.1 Changes to RFC1541. . . . . . . . . . . . . . . . . . . . . . 3 + 1.2 Related Work. . . . . . . . . . . . . . . . . . . . . . . . . 4 + 1.3 Problem definition and issues . . . . . . . . . . . . . . . . 4 + 1.4 Requirements. . . . . . . . . . . . . . . . . . . . . . . . . 5 + 1.5 Terminology . . . . . . . . . . . . . . . . . . . . . . . . . 6 + 1.6 Design goals. . . . . . . . . . . . . . . . . . . . . . . . . 6 + 2. Protocol Summary. . . . . . . . . . . . . . . . . . . . . . . 8 + 2.1 Configuration parameters repository . . . . . . . . . . . . . 11 + 2.2 Dynamic allocation of network addresses . . . . . . . . . . . 12 + 3. The Client-Server Protocol. . . . . . . . . . . . . . . . . . 13 + 3.1 Client-server interaction - allocating a network address. . . 13 + 3.2 Client-server interaction - reusing a previously allocated + network address . . . . . . . . . . . . . . . . . . . . . . . 17 + 3.3 Interpretation and representation of time values. . . . . . . 20 + 3.4 Obtaining parameters with externally configured network + address . . . . . . . . . . . . . . . . . . . . . . . . . . . 20 + 3.5 Client parameters in DHCP . . . . . . . . . . . . . . . . . . 21 + 3.6 Use of DHCP in clients with multiple interfaces . . . . . . . 22 + 3.7 When clients should use DHCP. . . . . . . . . . . . . . . . . 22 + 4. Specification of the DHCP client-server protocol. . . . . . . 22 + + + +Droms Standards Track [Page 1] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + + 4.1 Constructing and sending DHCP messages. . . . . . . . . . . . 22 + 4.2 DHCP server administrative controls . . . . . . . . . . . . . 25 + 4.3 DHCP server behavior. . . . . . . . . . . . . . . . . . . . . 26 + 4.4 DHCP client behavior. . . . . . . . . . . . . . . . . . . . . 34 + 5. Acknowledgments. . . . . . . . . . . . . . . . . . . . . . . .42 + 6. References . . . . . . . . . . . . . . . . . . . . . . . . . .42 + 7. Security Considerations. . . . . . . . . . . . . . . . . . . 
.43 + 8. Author's Address . . . . . . . . . . . . . . . . . . . . . . .44 + A. Host Configuration Parameters . . . . . . . . . . . . . . . .45 +List of Figures + 1. Format of a DHCP message . . . . . . . . . . . . . . . . . . . 9 + 2. Format of the 'flags' field. . . . . . . . . . . . . . . . . . 11 + 3. Timeline diagram of messages exchanged between DHCP client and + servers when allocating a new network address. . . . . . . . . 15 + 4. Timeline diagram of messages exchanged between DHCP client and + servers when reusing a previously allocated network address. . 18 + 5. State-transition diagram for DHCP clients. . . . . . . . . . . 34 +List of Tables + 1. Description of fields in a DHCP message. . . . . . . . . . . . 10 + 2. DHCP messages. . . . . . . . . . . . . . . . . . . . . . . . . 14 + 3. Fields and options used by DHCP servers. . . . . . . . . . . . 28 + 4. Client messages from various states. . . . . . . . . . . . . . 33 + 5. Fields and options used by DHCP clients. . . . . . . . . . . . 37 + +1. Introduction + + The Dynamic Host Configuration Protocol (DHCP) provides configuration + parameters to Internet hosts. DHCP consists of two components: a + protocol for delivering host-specific configuration parameters from a + DHCP server to a host and a mechanism for allocation of network + addresses to hosts. + + DHCP is built on a client-server model, where designated DHCP server + hosts allocate network addresses and deliver configuration parameters + to dynamically configured hosts. Throughout the remainder of this + document, the term "server" refers to a host providing initialization + parameters through DHCP, and the term "client" refers to a host + requesting initialization parameters from a DHCP server. + + A host should not act as a DHCP server unless explicitly configured + to do so by a system administrator. 
The diversity of hardware and + protocol implementations in the Internet would preclude reliable + operation if random hosts were allowed to respond to DHCP requests. + For example, IP requires the setting of many parameters within the + protocol implementation software. Because IP can be used on many + dissimilar kinds of network hardware, values for those parameters + cannot be guessed or assumed to have correct defaults. Also, + distributed address allocation schemes depend on a polling/defense + + + +Droms Standards Track [Page 2] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + + mechanism for discovery of addresses that are already in use. IP + hosts may not always be able to defend their network addresses, so + that such a distributed address allocation scheme cannot be + guaranteed to avoid allocation of duplicate network addresses. + + DHCP supports three mechanisms for IP address allocation. In + "automatic allocation", DHCP assigns a permanent IP address to a + client. In "dynamic allocation", DHCP assigns an IP address to a + client for a limited period of time (or until the client explicitly + relinquishes the address). In "manual allocation", a client's IP + address is assigned by the network administrator, and DHCP is used + simply to convey the assigned address to the client. A particular + network will use one or more of these mechanisms, depending on the + policies of the network administrator. + + Dynamic allocation is the only one of the three mechanisms that + allows automatic reuse of an address that is no longer needed by the + client to which it was assigned. Thus, dynamic allocation is + particularly useful for assigning an address to a client that will be + connected to the network only temporarily or for sharing a limited + pool of IP addresses among a group of clients that do not need + permanent IP addresses. 
Dynamic allocation may also be a good choice + for assigning an IP address to a new client being permanently + connected to a network where IP addresses are sufficiently scarce + that it is important to reclaim them when old clients are retired. + Manual allocation allows DHCP to be used to eliminate the error-prone + process of manually configuring hosts with IP addresses in + environments where (for whatever reasons) it is desirable to manage + IP address assignment outside of the DHCP mechanisms. + + The format of DHCP messages is based on the format of BOOTP messages, + to capture the BOOTP relay agent behavior described as part of the + BOOTP specification [7, 21] and to allow interoperability of existing + BOOTP clients with DHCP servers. Using BOOTP relay agents eliminates + the necessity of having a DHCP server on each physical network + segment. + +1.1 Changes to RFC 1541 + + This document updates the DHCP protocol specification that appears in + RFC 1541. A new DHCP message type, DHCPINFORM, has been added; see + sections 3.4, 4.3 and 4.4 for details. The classing mechanism for + identifying DHCP clients to DHCP servers has been extended to include + "vendor" classes as defined in sections 4.2 and 4.3. The minimum + lease time restriction has been removed. Finally, many editorial + changes have been made to clarify the text as a result of experience + gained in DHCP interoperability tests. + + + + +Droms Standards Track [Page 3] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + +1.2 Related Work + + There are several Internet protocols and related mechanisms that + address some parts of the dynamic host configuration problem. The + Reverse Address Resolution Protocol (RARP) [10] (through the + extensions defined in the Dynamic RARP (DRARP) [5]) explicitly + addresses the problem of network address discovery, and includes an + automatic IP address assignment mechanism.
The Trivial File Transfer + Protocol (TFTP) [20] provides for transport of a boot image from a + boot server. The Internet Control Message Protocol (ICMP) [16] + provides for informing hosts of additional routers via "ICMP + redirect" messages. ICMP also can provide subnet mask information + through the "ICMP mask request" message and other information through + the (obsolete) "ICMP information request" message. Hosts can locate + routers through the ICMP router discovery mechanism [8]. + + BOOTP is a transport mechanism for a collection of configuration + information. BOOTP is also extensible, and official extensions [17] + have been defined for several configuration parameters. Morgan has + proposed extensions to BOOTP for dynamic IP address assignment [15]. + The Network Information Protocol (NIP), used by the Athena project at + MIT, is a distributed mechanism for dynamic IP address assignment + [19]. The Resource Location Protocol RLP [1] provides for location + of higher level services. Sun Microsystems diskless workstations use + a boot procedure that employs RARP, TFTP and an RPC mechanism called + "bootparams" to deliver configuration information and operating + system code to diskless hosts. (Sun Microsystems, Sun Workstation + and SunOS are trademarks of Sun Microsystems, Inc.) Some Sun + networks also use DRARP and an auto-installation mechanism to + automate the configuration of new hosts in an existing network. + + In other related work, the path minimum transmission unit (MTU) + discovery algorithm can determine the MTU of an arbitrary internet + path [14]. The Address Resolution Protocol (ARP) has been proposed + as a transport protocol for resource location and selection [6]. + Finally, the Host Requirements RFCs [3, 4] mention specific + requirements for host reconfiguration and suggest a scenario for + initial configuration of diskless hosts. 
+ +1.3 Problem definition and issues + + DHCP is designed to supply DHCP clients with the configuration + parameters defined in the Host Requirements RFCs. After obtaining + parameters via DHCP, a DHCP client should be able to exchange packets + with any other host in the Internet. The TCP/IP stack parameters + supplied by DHCP are listed in Appendix A. + + + + + +Droms Standards Track [Page 4] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + + Not all of these parameters are required for a newly initialized + client. A client and server may negotiate for the transmission of + only those parameters required by the client or specific to a + particular subnet. + + DHCP allows but does not require the configuration of client + parameters not directly related to the IP protocol. DHCP also does + not address registration of newly configured clients with the Domain + Name System (DNS) [12, 13]. + + DHCP is not intended for use in configuring routers. + +1.4 Requirements + + Throughout this document, the words that are used to define the + significance of particular requirements are capitalized. These words + are: + + o "MUST" + + This word or the adjective "REQUIRED" means that the + item is an absolute requirement of this specification. + + o "MUST NOT" + + This phrase means that the item is an absolute prohibition + of this specification. + + o "SHOULD" + + This word or the adjective "RECOMMENDED" means that there + may exist valid reasons in particular circumstances to ignore + this item, but the full implications should be understood and + the case carefully weighed before choosing a different course. + + o "SHOULD NOT" + + This phrase means that there may exist valid reasons in + particular circumstances when the listed behavior is acceptable + or even useful, but the full implications should be understood + and the case carefully weighed before implementing any behavior + described with this label. 
+ + + + + + + + + +Droms Standards Track [Page 5] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + + o "MAY" + + This word or the adjective "OPTIONAL" means that this item is + truly optional. One vendor may choose to include the item + because a particular marketplace requires it or because it + enhances the product, for example; another vendor may omit the + same item. + +1.5 Terminology + + This document uses the following terms: + + o "DHCP client" + + A DHCP client is an Internet host using DHCP to obtain + configuration parameters such as a network address. + + o "DHCP server" + + A DHCP server is an Internet host that returns configuration + parameters to DHCP clients. + + o "BOOTP relay agent" + + A BOOTP relay agent or relay agent is an Internet host or router + that passes DHCP messages between DHCP clients and DHCP servers. + DHCP is designed to use the same relay agent behavior as specified + in the BOOTP protocol specification. + + o "binding" + + A binding is a collection of configuration parameters, including + at least an IP address, associated with or "bound to" a DHCP + client. Bindings are managed by DHCP servers. + +1.6 Design goals + + The following list gives general design goals for DHCP. + + o DHCP should be a mechanism rather than a policy. DHCP must + allow local system administrators control over configuration + parameters where desired; e.g., local system administrators + should be able to enforce local policies concerning allocation + and access to local resources where desired. + + + + + + + +Droms Standards Track [Page 6] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + + o Clients should require no manual configuration. Each client + should be able to discover appropriate local configuration + parameters without user intervention and incorporate those + parameters into its own configuration. + + o Networks should require no manual configuration for individual + clients. 
Under normal circumstances, the network manager + should not have to enter any per-client configuration + parameters. + + o DHCP should not require a server on each subnet. To allow for + scale and economy, DHCP must work across routers or through the + intervention of BOOTP relay agents. + + o A DHCP client must be prepared to receive multiple responses + to a request for configuration parameters. Some installations + may include multiple, overlapping DHCP servers to enhance + reliability and increase performance. + + o DHCP must coexist with statically configured, non-participating + hosts and with existing network protocol implementations. + + o DHCP must interoperate with the BOOTP relay agent behavior as + described by RFC 951 and by RFC 1542 [21]. + + o DHCP must provide service to existing BOOTP clients. + + The following list gives design goals specific to the transmission of + the network layer parameters. DHCP must: + + o Guarantee that any specific network address will not be in + use by more than one DHCP client at a time, + + o Retain DHCP client configuration across DHCP client reboot. A + DHCP client should, whenever possible, be assigned the same + configuration parameters (e.g., network address) in response + to each request, + + o Retain DHCP client configuration across server reboots, and, + whenever possible, a DHCP client should be assigned the same + configuration parameters despite restarts of the DHCP mechanism, + + o Allow automated assignment of configuration parameters to new + clients to avoid hand configuration for new clients, + + o Support fixed or permanent allocation of configuration + parameters to specific clients. + + + + +Droms Standards Track [Page 7] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + +2. Protocol Summary + + From the client's point of view, DHCP is an extension of the BOOTP + mechanism. 
This behavior allows existing BOOTP clients to + interoperate with DHCP servers without requiring any change to the + clients' initialization software. RFC 1542 [2] details the + interactions between BOOTP and DHCP clients and servers [9]. There + are some new, optional transactions that optimize the interaction + between DHCP clients and servers that are described in sections 3 and + 4. + + Figure 1 gives the format of a DHCP message and table 1 describes + each of the fields in the DHCP message. The numbers in parentheses + indicate the size of each field in octets. The names for the fields + given in the figure will be used throughout this document to refer to + the fields in DHCP messages. + + There are two primary differences between DHCP and BOOTP. First, + DHCP defines mechanisms through which clients can be assigned a + network address for a finite lease, allowing for serial reassignment + of network addresses to different clients. Second, DHCP provides the + mechanism for a client to acquire all of the IP configuration + parameters that it needs in order to operate. + + DHCP introduces a small change in terminology intended to clarify the + meaning of one of the fields. What was the "vendor extensions" field + in BOOTP has been re-named the "options" field in DHCP. Similarly, + the tagged data items that were used inside the BOOTP "vendor + extensions" field, which were formerly referred to as "vendor + extensions," are now termed simply "options." 
+ + + + + + + + + + + + + + + + + + + + + +Droms Standards Track [Page 8] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | op (1) | htype (1) | hlen (1) | hops (1) | + +---------------+---------------+---------------+---------------+ + | xid (4) | + +-------------------------------+-------------------------------+ + | secs (2) | flags (2) | + +-------------------------------+-------------------------------+ + | ciaddr (4) | + +---------------------------------------------------------------+ + | yiaddr (4) | + +---------------------------------------------------------------+ + | siaddr (4) | + +---------------------------------------------------------------+ + | giaddr (4) | + +---------------------------------------------------------------+ + | | + | chaddr (16) | + | | + | | + +---------------------------------------------------------------+ + | | + | sname (64) | + +---------------------------------------------------------------+ + | | + | file (128) | + +---------------------------------------------------------------+ + | | + | options (variable) | + +---------------------------------------------------------------+ + + Figure 1: Format of a DHCP message + + DHCP defines a new 'client identifier' option that is used to pass an + explicit client identifier to a DHCP server. This change eliminates + the overloading of the 'chaddr' field in BOOTP messages, where + 'chaddr' is used both as a hardware address for transmission of BOOTP + reply messages and as a client identifier. The 'client identifier' + is an opaque key, not to be interpreted by the server; for example, + the 'client identifier' may contain a hardware address, identical to + the contents of the 'chaddr' field, or it may contain another type of + identifier, such as a DNS name. 
The 'client identifier' chosen by a + DHCP client MUST be unique to that client within the subnet to which + the client is attached. If the client uses a 'client identifier' in + one message, it MUST use that same identifier in all subsequent + messages, to ensure that all servers correctly identify the client. + + + + +Droms Standards Track [Page 9] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + + DHCP clarifies the interpretation of the 'siaddr' field as the + address of the server to use in the next step of the client's + bootstrap process. A DHCP server may return its own address in the + 'siaddr' field, if the server is prepared to supply the next + bootstrap service (e.g., delivery of an operating system executable + image). A DHCP server always returns its own address in the 'server + identifier' option. + + FIELD OCTETS DESCRIPTION + ----- ------ ----------- + + op 1 Message op code / message type. + 1 = BOOTREQUEST, 2 = BOOTREPLY + htype 1 Hardware address type, see ARP section in "Assigned + Numbers" RFC; e.g., '1' = 10mb ethernet. + hlen 1 Hardware address length (e.g. '6' for 10mb + ethernet). + hops 1 Client sets to zero, optionally used by relay agents + when booting via a relay agent. + xid 4 Transaction ID, a random number chosen by the + client, used by the client and server to associate + messages and responses between a client and a + server. + secs 2 Filled in by client, seconds elapsed since client + began address acquisition or renewal process. + flags 2 Flags (see figure 2). + ciaddr 4 Client IP address; only filled in if client is in + BOUND, RENEW or REBINDING state and can respond + to ARP requests. + yiaddr 4 'your' (client) IP address. + siaddr 4 IP address of next server to use in bootstrap; + returned in DHCPOFFER, DHCPACK by server. + giaddr 4 Relay agent IP address, used in booting via a + relay agent. + chaddr 16 Client hardware address. + sname 64 Optional server host name, null terminated string. 
+ file 128 Boot file name, null terminated string; "generic" + name or null in DHCPDISCOVER, fully qualified + directory-path name in DHCPOFFER. + options var Optional parameters field. See the options + documents for a list of defined options. + + Table 1: Description of fields in a DHCP message + + The 'options' field is now variable length. A DHCP client must be + prepared to receive DHCP messages with an 'options' field of at least + length 312 octets. This requirement implies that a DHCP client must + be prepared to receive a message of up to 576 octets, the minimum IP + + + +Droms Standards Track [Page 10] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + + datagram size an IP host must be prepared to accept [3]. DHCP + clients may negotiate the use of larger DHCP messages through the + 'maximum DHCP message size' option. The options field may be further + extended into the 'file' and 'sname' fields. + + In the case of a client using DHCP for initial configuration (before + the client's TCP/IP software has been completely configured), DHCP + requires creative use of the client's TCP/IP software and liberal + interpretation of RFC 1122. The TCP/IP software SHOULD accept and + forward to the IP layer any IP packets delivered to the client's + hardware address before the IP address is configured; DHCP servers + and BOOTP relay agents may not be able to deliver DHCP messages to + clients that cannot accept hardware unicast datagrams before the + TCP/IP software is configured. + + To work around some clients that cannot accept IP unicast datagrams + before the TCP/IP software is configured as discussed in the previous + paragraph, DHCP uses the 'flags' field [21]. The leftmost bit is + defined as the BROADCAST (B) flag. The semantics of this flag are + discussed in section 4.1 of this document. The remaining bits of the + flags field are reserved for future use. They MUST be set to zero by + clients and ignored by servers and relay agents. 
Figure 2 gives the + format of the 'flags' field. + + 1 1 1 1 1 1 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + |B| MBZ | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + B: BROADCAST flag + + MBZ: MUST BE ZERO (reserved for future use) + + Figure 2: Format of the 'flags' field + +2.1 Configuration parameters repository + + The first service provided by DHCP is to provide persistent storage + of network parameters for network clients. The model of DHCP + persistent storage is that the DHCP service stores a key-value entry + for each client, where the key is some unique identifier (for + example, an IP subnet number and a unique identifier within the + subnet) and the value contains the configuration parameters for the + client. + + For example, the key might be the pair (IP-subnet-number, hardware- + address) (note that the "hardware-address" should be typed by the + + + +Droms Standards Track [Page 11] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + + type of hardware to accommodate possible duplication of hardware + addresses resulting from bit-ordering problems in a mixed-media, + bridged network) allowing for serial or concurrent reuse of a + hardware address on different subnets, and for hardware addresses + that may not be globally unique. Alternately, the key might be the + pair (IP-subnet-number, hostname), allowing the server to assign + parameters intelligently to a DHCP client that has been moved to a + different subnet or has changed hardware addresses (perhaps because + the network interface failed and was replaced). The protocol defines + that the key will be (IP-subnet-number, hardware-address) unless the + client explicitly supplies an identifier using the 'client + identifier' option. A client can query the DHCP service to + retrieve its configuration parameters. 
The client interface to the + configuration parameters repository consists of protocol messages to + request configuration parameters and responses from the server + carrying the configuration parameters. + +2.2 Dynamic allocation of network addresses + + The second service provided by DHCP is the allocation of temporary or + permanent network (IP) addresses to clients. The basic mechanism for + the dynamic allocation of network addresses is simple: a client + requests the use of an address for some period of time. The + allocation mechanism (the collection of DHCP servers) guarantees not + to reallocate that address within the requested time and attempts to + return the same network address each time the client requests an + address. In this document, the period over which a network address + is allocated to a client is referred to as a "lease" [11]. The + client may extend its lease with subsequent requests. The client may + issue a message to release the address back to the server when the + client no longer needs the address. The client may ask for a + permanent assignment by asking for an infinite lease. Even when + assigning "permanent" addresses, a server may choose to give out + lengthy but non-infinite leases to allow detection of the fact that + the client has been retired. + + In some environments it will be necessary to reassign network + addresses due to exhaustion of available addresses. In such + environments, the allocation mechanism will reuse addresses whose + lease has expired. The server should use whatever information is + available in the configuration information repository to choose an + address to reuse. For example, the server may choose the least + recently assigned address. As a consistency check, the allocating + server SHOULD probe the reused address before allocating the address, + e.g., with an ICMP echo request, and the client SHOULD probe the + newly received address, e.g., with ARP. 
+ + + + + +Droms Standards Track [Page 12] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + +3. The Client-Server Protocol + + DHCP uses the BOOTP message format defined in RFC 951 and given in + table 1 and figure 1. The 'op' field of each DHCP message sent from + a client to a server contains BOOTREQUEST. BOOTREPLY is used in the + 'op' field of each DHCP message sent from a server to a client. + + The first four octets of the 'options' field of the DHCP message + contain the (decimal) values 99, 130, 83 and 99, respectively (this + is the same magic cookie as is defined in RFC 1497 [17]). The + remainder of the 'options' field consists of a list of tagged + parameters that are called "options". All of the "vendor extensions" + listed in RFC 1497 are also DHCP options. RFC 1533 gives the + complete set of options defined for use with DHCP. + + Several options have been defined so far. One particular option - + the "DHCP message type" option - must be included in every DHCP + message. This option defines the "type" of the DHCP message. + Additional options may be allowed, required, or not allowed, + depending on the DHCP message type. + + Throughout this document, DHCP messages that include a 'DHCP message + type' option will be referred to by the type of the message; e.g., a + DHCP message with 'DHCP message type' option type 1 will be referred + to as a "DHCPDISCOVER" message. + +3.1 Client-server interaction - allocating a network address + + The following summary of the protocol exchanges between clients and + servers refers to the DHCP messages described in table 2. The + timeline diagram in figure 3 shows the timing relationships in a + typical client-server interaction. If the client already knows its + address, some steps may be omitted; this abbreviated interaction is + described in section 3.2. + + 1. The client broadcasts a DHCPDISCOVER message on its local physical + subnet. 
The DHCPDISCOVER message MAY include options that suggest + values for the network address and lease duration. BOOTP relay + agents may pass the message on to DHCP servers not on the same + physical subnet. + + 2. Each server may respond with a DHCPOFFER message that includes an + available network address in the 'yiaddr' field (and other + configuration parameters in DHCP options). Servers need not + reserve the offered network address, although the protocol will + work more efficiently if the server avoids allocating the offered + network address to another client. When allocating a new address, + servers SHOULD check that the offered network address is not + + + +Droms Standards Track [Page 13] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + + already in use; e.g., the server may probe the offered address + with an ICMP Echo Request. Servers SHOULD be implemented so that + network administrators MAY choose to disable probes of newly + allocated addresses. The server transmits the DHCPOFFER message + to the client, using the BOOTP relay agent if necessary. + + Message Use + ------- --- + + DHCPDISCOVER - Client broadcast to locate available servers. + + DHCPOFFER - Server to client in response to DHCPDISCOVER with + offer of configuration parameters. + + DHCPREQUEST - Client message to servers either (a) requesting + offered parameters from one server and implicitly + declining offers from all others, (b) confirming + correctness of previously allocated address after, + e.g., system reboot, or (c) extending the lease on a + particular network address. + + DHCPACK - Server to client with configuration parameters, + including committed network address. + + DHCPNAK - Server to client indicating client's notion of network + address is incorrect (e.g., client has moved to new + subnet) or client's lease has expired. + + DHCPDECLINE - Client to server indicating network address is already + in use.
+ + DHCPRELEASE - Client to server relinquishing network address and + cancelling remaining lease. + + DHCPINFORM - Client to server, asking only for local configuration + parameters; client already has externally configured + network address. + + Table 2: DHCP messages + + + + + + + + + + + + +Droms Standards Track [Page 14] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + + Server Client Server + (not selected) (selected) + + v v v + | | | + | Begins initialization | + | | | + | _____________/|\____________ | + |/DHCPDISCOVER | DHCPDISCOVER \| + | | | + Determines | Determines + configuration | configuration + | | | + |\ | ____________/ | + | \________ | /DHCPOFFER | + | DHCPOFFER\ |/ | + | \ | | + | Collects replies | + | \| | + | Selects configuration | + | | | + | _____________/|\____________ | + |/ DHCPREQUEST | DHCPREQUEST\ | + | | | + | | Commits configuration + | | | + | | _____________/| + | |/ DHCPACK | + | | | + | Initialization complete | + | | | + . . . + . . . + | | | + | Graceful shutdown | + | | | + | |\ ____________ | + | | DHCPRELEASE \| + | | | + | | Discards lease + | | | + v v v + Figure 3: Timeline diagram of messages exchanged between DHCP + client and servers when allocating a new network address + + + + + + + +Droms Standards Track [Page 15] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + + 3. The client receives one or more DHCPOFFER messages from one or more + servers. The client may choose to wait for multiple responses. + The client chooses one server from which to request configuration + parameters, based on the configuration parameters offered in the + DHCPOFFER messages. The client broadcasts a DHCPREQUEST message + that MUST include the 'server identifier' option to indicate which + server it has selected, and that MAY include other options + specifying desired configuration values. The 'requested IP + address' option MUST be set to the value of 'yiaddr' in the + DHCPOFFER message from the server. 
This DHCPREQUEST message is + broadcast and relayed through DHCP/BOOTP relay agents. To help + ensure that any BOOTP relay agents forward the DHCPREQUEST message + to the same set of DHCP servers that received the original + DHCPDISCOVER message, the DHCPREQUEST message MUST use the same + value in the DHCP message header's 'secs' field and be sent to the + same IP broadcast address as the original DHCPDISCOVER message. + The client times out and retransmits the DHCPDISCOVER message if + the client receives no DHCPOFFER messages. + + 4. The servers receive the DHCPREQUEST broadcast from the client. + Those servers not selected by the DHCPREQUEST message use the + message as notification that the client has declined that server's + offer. The server selected in the DHCPREQUEST message commits the + binding for the client to persistent storage and responds with a + DHCPACK message containing the configuration parameters for the + requesting client. The combination of 'client identifier' or + 'chaddr' and assigned network address constitute a unique + identifier for the client's lease and are used by both the client + and server to identify a lease referred to in any DHCP messages. + Any configuration parameters in the DHCPACK message SHOULD NOT + conflict with those in the earlier DHCPOFFER message to which the + client is responding. The server SHOULD NOT check the offered + network address at this point. The 'yiaddr' field in the DHCPACK + messages is filled in with the selected network address. + + If the selected server is unable to satisfy the DHCPREQUEST message + (e.g., the requested network address has been allocated), the + server SHOULD respond with a DHCPNAK message. + + A server MAY choose to mark addresses offered to clients in + DHCPOFFER messages as unavailable. The server SHOULD mark an + address offered to a client in a DHCPOFFER message as available if + the server receives no DHCPREQUEST message from that client. + + 5. 
The client receives the DHCPACK message with configuration + parameters. The client SHOULD perform a final check on the + parameters (e.g., ARP for allocated network address), and notes the + duration of the lease specified in the DHCPACK message. At this + + + +Droms Standards Track [Page 16] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + + point, the client is configured. If the client detects that the + address is already in use (e.g., through the use of ARP), the + client MUST send a DHCPDECLINE message to the server and restarts + the configuration process. The client SHOULD wait a minimum of ten + seconds before restarting the configuration process to avoid + excessive network traffic in case of looping. + + If the client receives a DHCPNAK message, the client restarts the + configuration process. + + The client times out and retransmits the DHCPREQUEST message if the + client receives neither a DHCPACK nor a DHCPNAK message. The client + retransmits the DHCPREQUEST according to the retransmission + algorithm in section 4.1. The client should choose to retransmit + the DHCPREQUEST enough times to give adequate probability of + contacting the server without causing the client (and the user of + that client) to wait overly long before giving up; e.g., a client + retransmitting as described in section 4.1 might retransmit the + DHCPREQUEST message four times, for a total delay of 60 seconds, + before restarting the initialization procedure. If the client + receives neither a DHCPACK nor a DHCPNAK message after employing the + retransmission algorithm, the client reverts to INIT state and + restarts the initialization process. The client SHOULD notify the + user that the initialization process has failed and is restarting. + + 6. The client may choose to relinquish its lease on a network address + by sending a DHCPRELEASE message to the server.
The client + identifies the lease to be released with its 'client identifier', + or 'chaddr' and network address in the DHCPRELEASE message. If the + client used a 'client identifier' when it obtained the lease, it + MUST use the same 'client identifier' in the DHCPRELEASE message. + +3.2 Client-server interaction - reusing a previously allocated network + address + + If a client remembers and wishes to reuse a previously allocated + network address, a client may choose to omit some of the steps + described in the previous section. The timeline diagram in figure 4 + shows the timing relationships in a typical client-server interaction + for a client reusing a previously allocated network address. + + + + + + + + + + + +Droms Standards Track [Page 17] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + + 1. The client broadcasts a DHCPREQUEST message on its local subnet. + The message includes the client's network address in the + 'requested IP address' option. As the client has not received its + network address, it MUST NOT fill in the 'ciaddr' field. BOOTP + relay agents pass the message on to DHCP servers not on the same + subnet. If the client used a 'client identifier' to obtain its + address, the client MUST use the same 'client identifier' in the + DHCPREQUEST message. + + 2. Servers with knowledge of the client's configuration parameters + respond with a DHCPACK message to the client. Servers SHOULD NOT + check that the client's network address is already in use; the + client may respond to ICMP Echo Request messages at this point. 
+ + Server Client Server + + v v v + | | | + | Begins | + | initialization | + | | | + | /|\ | + | _________ __/ | \__________ | + | /DHCPREQU EST | DHCPREQUEST\ | + |/ | \| + | | | + Locates | Locates + configuration | configuration + | | | + |\ | /| + | \ | ___________/ | + | \ | / DHCPACK | + | \ _______ |/ | + | DHCPACK\ | | + | Initialization | + | complete | + | \| | + | | | + | (Subsequent | + | DHCPACKS | + | ignored) | + | | | + | | | + v v v + + Figure 4: Timeline diagram of messages exchanged between DHCP + client and servers when reusing a previously allocated + network address + + + +Droms Standards Track [Page 18] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + + If the client's request is invalid (e.g., the client has moved + to a new subnet), servers SHOULD respond with a DHCPNAK message to + the client. Servers SHOULD NOT respond if their information is not + guaranteed to be accurate. For example, a server that identifies a + request for an expired binding that is owned by another server SHOULD + NOT respond with a DHCPNAK unless the servers are using an explicit + mechanism to maintain coherency among the servers. + + If 'giaddr' is 0x0 in the DHCPREQUEST message, the client is on + the same subnet as the server. The server MUST + broadcast the DHCPNAK message to the 0xffffffff broadcast address + because the client may not have a correct network address or subnet + mask, and the client may not be answering ARP requests. + Otherwise, the server MUST send the DHCPNAK message to the IP + address of the BOOTP relay agent, as recorded in 'giaddr'. The + relay agent will, in turn, forward the message directly to the + client's hardware address, so that the DHCPNAK can be delivered even + if the client has moved to a new network. + + 3. The client receives the DHCPACK message with configuration + parameters. 
The client performs a final check on the parameters + (as in section 3.1), and notes the duration of the lease specified + in the DHCPACK message. The specific lease is implicitly identified + by the 'client identifier' or 'chaddr' and the network address. At + this point, the client is configured. + + If the client detects that the IP address in the DHCPACK message + is already in use, the client MUST send a DHCPDECLINE message to the + server and restarts the configuration process by requesting a + new network address. This action corresponds to the client + moving to the INIT state in the DHCP state diagram, which is + described in section 4.4. + + If the client receives a DHCPNAK message, it cannot reuse its + remembered network address. It must instead request a new + address by restarting the configuration process, this time + using the (non-abbreviated) procedure described in section + 3.1. This action also corresponds to the client moving to + the INIT state in the DHCP state diagram. + + The client times out and retransmits the DHCPREQUEST message if + the client receives neither a DHCPACK nor a DHCPNAK message. The + client retransmits the DHCPREQUEST according to the retransmission + algorithm in section 4.1. The client should choose to retransmit + the DHCPREQUEST enough times to give adequate probability of + contacting the server without causing the client (and the user of + that client) to wait overly long before giving up; e.g., a client + retransmitting as described in section 4.1 might retransmit the + + + +Droms Standards Track [Page 19] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + + DHCPREQUEST message four times, for a total delay of 60 seconds, + before restarting the initialization procedure. 
If the client + receives neither a DHCPACK nor a DHCPNAK message after employing + the retransmission algorithm, the client MAY choose to use the + previously allocated network address and configuration parameters + for the remainder of the unexpired lease. This corresponds to + moving to BOUND state in the client state transition diagram shown + in figure 5. + + 4. The client may choose to relinquish its lease on a network + address by sending a DHCPRELEASE message to the server. The + client identifies the lease to be released with its + 'client identifier', or 'chaddr' and network address in the + DHCPRELEASE message. + + Note that in this case, where the client retains its network + address locally, the client will not normally relinquish its + lease during a graceful shutdown. Only in the case where the + client explicitly needs to relinquish its lease, e.g., the client + is about to be moved to a different subnet, will the client send + a DHCPRELEASE message. + +3.3 Interpretation and representation of time values + + A client acquires a lease for a network address for a fixed period of + time (which may be infinite). Throughout the protocol, times are to + be represented in units of seconds. The time value of 0xffffffff is + reserved to represent "infinity". + + As clients and servers may not have synchronized clocks, times are + represented in DHCP messages as relative times, to be interpreted + with respect to the client's local clock. Representing relative + times in units of seconds in an unsigned 32 bit word gives a range of + relative times from 0 to approximately 100 years, which is sufficient + for the relative times to be measured using DHCP. + + The algorithm for lease duration interpretation given in the previous + paragraph assumes that client and server clocks are stable relative + to each other. If there is drift between the two clocks, the server + may consider the lease expired before the client does.
To + compensate, the server may return a shorter lease duration to the + client than the server commits to its local database of client + information. + +3.4 Obtaining parameters with externally configured network address + + If a client has obtained a network address through some other means + (e.g., manual configuration), it may use a DHCPINFORM request message + + + +Droms Standards Track [Page 20] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + + to obtain other local configuration parameters. Servers receiving a + DHCPINFORM message construct a DHCPACK message with any local + configuration parameters appropriate for the client without: + allocating a new address, checking for an existing binding, filling + in 'yiaddr' or including lease time parameters. The servers SHOULD + unicast the DHCPACK reply to the address given in the 'ciaddr' field + of the DHCPINFORM message. + + The server SHOULD check the network address in a DHCPINFORM message + for consistency, but MUST NOT check for an existing lease. The + server forms a DHCPACK message containing the configuration + parameters for the requesting client and sends the DHCPACK message + directly to the client. + +3.5 Client parameters in DHCP + + Not all clients require initialization of all parameters listed in + Appendix A. Two techniques are used to reduce the number of + parameters transmitted from the server to the client. First, most of + the parameters have defaults defined in the Host Requirements RFCs; + if the client receives no parameters from the server that override + the defaults, a client uses those default values. Second, in its + initial DHCPDISCOVER or DHCPREQUEST message, a client may provide the + server with a list of specific parameters the client is interested + in. If the client includes a list of parameters in a DHCPDISCOVER + message, it MUST include that list in any subsequent DHCPREQUEST + messages. 
+ + The client SHOULD include the 'maximum DHCP message size' option to + let the server know how large the server may make its DHCP messages. + The parameters returned to a client may still exceed the space + allocated to options in a DHCP message. In this case, two additional + options flags (which must appear in the 'options' field of the + message) indicate that the 'file' and 'sname' fields are to be used + for options. + + The client can inform the server which configuration parameters the + client is interested in by including the 'parameter request list' + option. The data portion of this option explicitly lists the options + requested by tag number. + + In addition, the client may suggest values for the network address + and lease time in the DHCPDISCOVER message. The client may include + the 'requested IP address' option to suggest that a particular IP + address be assigned, and may include the 'IP address lease time' + option to suggest the lease time it would like. Other options + representing "hints" at configuration parameters are allowed in a + DHCPDISCOVER or DHCPREQUEST message. However, additional options may + + + +Droms Standards Track [Page 21] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + + be ignored by servers, and multiple servers may, therefore, not + return identical values for some options. The 'requested IP address' + option is to be filled in only in a DHCPREQUEST message when the + client is verifying network parameters obtained previously. The + client fills in the 'ciaddr' field only when correctly configured + with an IP address in BOUND, RENEWING or REBINDING state. + + If a server receives a DHCPREQUEST message with an invalid 'requested + IP address', the server SHOULD respond to the client with a DHCPNAK + message and may choose to report the problem to the system + administrator. The server may include an error message in the + 'message' option. 
+ +3.6 Use of DHCP in clients with multiple interfaces + + A client with multiple network interfaces must use DHCP through each + interface independently to obtain configuration information + parameters for those separate interfaces. + +3.7 When clients should use DHCP + + A client SHOULD use DHCP to reacquire or verify its IP address and + network parameters whenever the local network parameters may have + changed; e.g., at system boot time or after a disconnection from the + local network, as the local network configuration may change without + the client's or user's knowledge. + + If a client has knowledge of a previous network address and is unable + to contact a local DHCP server, the client may continue to use the + previous network address until the lease for that address expires. + If the lease expires before the client can contact a DHCP server, the + client must immediately discontinue use of the previous network + address and may inform local users of the problem. + +4. Specification of the DHCP client-server protocol + + In this section, we assume that a DHCP server has a block of network + addresses from which it can satisfy requests for new addresses. Each + server also maintains a database of allocated addresses and leases in + local permanent storage. + +4.1 Constructing and sending DHCP messages + + DHCP clients and servers both construct DHCP messages by filling in + fields in the fixed format section of the message and appending + tagged data items in the variable length option area. The options + area includes first a four-octet 'magic cookie' (which was described + in section 3), followed by the options. The last option must always + + + +Droms Standards Track [Page 22] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + + be the 'end' option. + + DHCP uses UDP as its transport protocol. 
DHCP messages from a client + to a server are sent to the 'DHCP server' port (67), and DHCP + messages from a server to a client are sent to the 'DHCP client' port + (68). A server with multiple network addresses (e.g., a multi-homed + host) MAY use any of its network addresses in outgoing DHCP messages. + + The 'server identifier' field is used both to identify a DHCP server + in a DHCP message and as a destination address from clients to + servers. A server with multiple network addresses MUST be prepared + to accept any of its network addresses as identifying that server + in a DHCP message. To accommodate potentially incomplete network + connectivity, a server MUST choose an address as a 'server + identifier' that, to the best of the server's knowledge, is reachable + from the client. For example, if the DHCP server and the DHCP client + are connected to the same subnet (i.e., the 'giaddr' field in the + message from the client is zero), the server SHOULD select the IP + address the server is using for communication on that subnet as the + 'server identifier'. If the server is using multiple IP addresses on + that subnet, any such address may be used. If the server has + received a message through a DHCP relay agent, the server SHOULD + choose an address from the interface on which the message was + received as the 'server identifier' (unless the server has other, + better information on which to make its choice). DHCP clients MUST + use the IP address provided in the 'server identifier' option for any + unicast requests to the DHCP server. + + DHCP messages broadcast by a client prior to that client obtaining + its IP address must have the source address field in the IP header + set to 0. + + If the 'giaddr' field in a DHCP message from a client is non-zero, + the server sends any return messages to the 'DHCP server' port on the + BOOTP relay agent whose address appears in 'giaddr'.
If the 'giaddr' + field is zero and the 'ciaddr' field is nonzero, then the server + unicasts DHCPOFFER and DHCPACK messages to the address in 'ciaddr'. + If 'giaddr' is zero and 'ciaddr' is zero, and the broadcast bit is + set, then the server broadcasts DHCPOFFER and DHCPACK messages to + 0xffffffff. If the broadcast bit is not set and 'giaddr' is zero and + 'ciaddr' is zero, then the server unicasts DHCPOFFER and DHCPACK + messages to the client's hardware address and 'yiaddr' address. In + all cases, when 'giaddr' is zero, the server broadcasts any DHCPNAK + messages to 0xffffffff. + + If the options in a DHCP message extend into the 'sname' and 'file' + fields, the 'option overload' option MUST appear in the 'options' + field, with value 1, 2 or 3, as specified in RFC 1533. If the + + + +Droms Standards Track [Page 23] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + + 'option overload' option is present in the 'options' field, the + options in the 'options' field MUST be terminated by an 'end' option, + and MAY contain one or more 'pad' options to fill the options field. + The options in the 'sname' and 'file' fields (if in use as indicated + by the 'options overload' option) MUST begin with the first octet of + the field, MUST be terminated by an 'end' option, and MUST be + followed by 'pad' options to fill the remainder of the field. Any + individual option in the 'options', 'sname' and 'file' fields MUST be + entirely contained in that field. The options in the 'options' field + MUST be interpreted first, so that any 'option overload' options may + be interpreted. The 'file' field MUST be interpreted next (if the + 'option overload' option indicates that the 'file' field contains + DHCP options), followed by the 'sname' field. + + The values to be passed in an 'option' tag may be too long to fit in + the 255 octets available to a single option (e.g., a list of routers + in a 'router' option [21]). 
Options may appear only once, unless + otherwise specified in the options document. The client concatenates + the values of multiple instances of the same option into a single + parameter list for configuration. + + DHCP clients are responsible for all message retransmission. The + client MUST adopt a retransmission strategy that incorporates a + randomized exponential backoff algorithm to determine the delay + between retransmissions. The delay between retransmissions SHOULD be + chosen to allow sufficient time for replies from the server to be + delivered based on the characteristics of the internetwork between + the client and the server. For example, in a 10Mb/sec Ethernet + internetwork, the delay before the first retransmission SHOULD be 4 + seconds randomized by the value of a uniform random number chosen + from the range -1 to +1. Clients with clocks that provide resolution + granularity of less than one second may choose a non-integer + randomization value. The delay before the next retransmission SHOULD + be 8 seconds randomized by the value of a uniform number chosen from + the range -1 to +1. The retransmission delay SHOULD be doubled with + subsequent retransmissions up to a maximum of 64 seconds. The client + MAY provide an indication of retransmission attempts to the user as + an indication of the progress of the configuration process. + + The 'xid' field is used by the client to match incoming DHCP messages + with pending requests. A DHCP client MUST choose 'xid's in such a + way as to minimize the chance of using an 'xid' identical to one used + by another client. For example, a client may choose a different, + random initial 'xid' each time the client is rebooted, and + subsequently use sequential 'xid's until the next reboot. Selecting + a new 'xid' for each retransmission is an implementation decision. A + client may choose to reuse the same 'xid' or select a new 'xid' for + each retransmitted message. 
+ + + +Droms Standards Track [Page 24] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + + Normally, DHCP servers and BOOTP relay agents attempt to deliver + DHCPOFFER, DHCPACK and DHCPNAK messages directly to the client using + unicast delivery. The IP destination address (in the IP header) is + set to the DHCP 'yiaddr' address and the link-layer destination + address is set to the DHCP 'chaddr' address. Unfortunately, some + client implementations are unable to receive such unicast IP + datagrams until the implementation has been configured with a valid + IP address (leading to a deadlock in which the client's IP address + cannot be delivered until the client has been configured with an IP + address). + + A client that cannot receive unicast IP datagrams until its protocol + software has been configured with an IP address SHOULD set the + BROADCAST bit in the 'flags' field to 1 in any DHCPDISCOVER or + DHCPREQUEST messages that client sends. The BROADCAST bit will + provide a hint to the DHCP server and BOOTP relay agent to broadcast + any messages to the client on the client's subnet. A client that can + receive unicast IP datagrams before its protocol software has been + configured SHOULD clear the BROADCAST bit to 0. The BOOTP + clarifications document discusses the ramifications of the use of the + BROADCAST bit [21]. + + A server or relay agent sending or relaying a DHCP message directly + to a DHCP client (i.e., not to a relay agent specified in the + 'giaddr' field) SHOULD examine the BROADCAST bit in the 'flags' + field. If this bit is set to 1, the DHCP message SHOULD be sent as + an IP broadcast using an IP broadcast address (preferably 0xffffffff) + as the IP destination address and the link-layer broadcast address as + the link-layer destination address.
If the BROADCAST bit is cleared + to 0, the message SHOULD be sent as an IP unicast to the IP address + specified in the 'yiaddr' field and the link-layer address specified + in the 'chaddr' field. If unicasting is not possible, the message + MAY be sent as an IP broadcast using an IP broadcast address + (preferably 0xffffffff) as the IP destination address and the link- + layer broadcast address as the link-layer destination address. + +4.2 DHCP server administrative controls + + DHCP servers are not required to respond to every DHCPDISCOVER and + DHCPREQUEST message they receive. For example, a network + administrator, to retain stringent control over the clients attached + to the network, may choose to configure DHCP servers to respond only + to clients that have been previously registered through some external + mechanism. The DHCP specification describes only the interactions + between clients and servers when the clients and servers choose to + interact; it is beyond the scope of the DHCP specification to + describe all of the administrative controls that system + administrators might want to use. Specific DHCP server + + + +Droms Standards Track [Page 25] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + + implementations may incorporate any controls or policies desired by a + network administrator. + + In some environments, a DHCP server will have to consider the values + of the vendor class options included in DHCPDISCOVER or DHCPREQUEST + messages when determining the correct parameters for a particular + client. + + A DHCP server needs to use some unique identifier to associate a + client with its lease. The client MAY choose to explicitly provide + the identifier through the 'client identifier' option. If the client + supplies a 'client identifier', the client MUST use the same 'client + identifier' in all subsequent messages, and the server MUST use that + identifier to identify the client. 
If the client does not provide a + 'client identifier' option, the server MUST use the contents of the + 'chaddr' field to identify the client. It is crucial for a DHCP + client to use an identifier unique within the subnet to which the + client is attached in the 'client identifier' option. Use of + 'chaddr' as the client's unique identifier may cause unexpected + results, as that identifier may be associated with a hardware + interface that could be moved to a new client. Some sites may choose + to use a manufacturer's serial number as the 'client identifier', to + avoid unexpected changes in a client's network address due to transfer + of hardware interfaces among computers. Sites may also choose to use + a DNS name as the 'client identifier', causing address leases to be + associated with the DNS name rather than a specific hardware box. + + DHCP clients are free to use any strategy in selecting a DHCP server + among those from which the client receives a DHCPOFFER message. The + client implementation of DHCP SHOULD provide a mechanism for the user + to select directly the 'vendor class identifier' values. + +4.3 DHCP server behavior + + A DHCP server processes incoming DHCP messages from a client based on + the current state of the binding for that client. A DHCP server can + receive the following messages from a client: + + o DHCPDISCOVER + + o DHCPREQUEST + + o DHCPDECLINE + + o DHCPRELEASE + + o DHCPINFORM + + + + +Droms Standards Track [Page 26] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + + Table 3 gives the use of the fields and options in a DHCP message by + a server. The remainder of this section describes the action of the + DHCP server for each possible incoming message. + +4.3.1 DHCPDISCOVER message + + When a server receives a DHCPDISCOVER message from a client, the + server chooses a network address for the requesting client. If no + address is available, the server may choose to report the problem to + the system administrator.
If an address is available, the new address + SHOULD be chosen as follows: + + o The client's current address as recorded in the client's current + binding, ELSE + + o The client's previous address as recorded in the client's (now + expired or released) binding, if that address is in the server's + pool of available addresses and not already allocated, ELSE + + o The address requested in the 'Requested IP Address' option, if that + address is valid and not already allocated, ELSE + + o A new address allocated from the server's pool of available + addresses; the address is selected based on the subnet from which + the message was received (if 'giaddr' is 0) or on the address of + the relay agent that forwarded the message ('giaddr' when not 0). + + As described in section 4.2, a server MAY, for administrative + reasons, assign an address other than the one requested, or may + refuse to allocate an address to a particular client even though free + addresses are available. + + Note that, in some network architectures (e.g., internets with more + than one IP subnet assigned to a physical network segment), it may be + the case that the DHCP client should be assigned an address from a + different subnet than the address recorded in 'giaddr'. Thus, DHCP + does not require that the client be assigned an address from the + subnet in 'giaddr'. A server is free to choose some other subnet, + and it is beyond the scope of the DHCP specification to describe ways + in which the assigned IP address might be chosen. + + While not required for correct operation of DHCP, the server SHOULD + NOT reuse the selected network address before the client responds to + the server's DHCPOFFER message. The server may choose to record the + address as offered to the client.
+ + The server must also choose an expiration time for the lease, as + follows: + + + +Droms Standards Track [Page 27] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + + o IF the client has not requested a specific lease in the + DHCPDISCOVER message and the client already has an assigned network + address, the server returns the lease expiration time previously + assigned to that address (note that the client must explicitly + request a specific lease to extend the expiration time on a + previously assigned address), ELSE + + o IF the client has not requested a specific lease in the + DHCPDISCOVER message and the client does not have an assigned + network address, the server assigns a locally configured default + lease time, ELSE + + o IF the client has requested a specific lease in the DHCPDISCOVER + message (regardless of whether the client has an assigned network + address), the server may choose either to return the requested + lease (if the lease is acceptable to local policy) or select + another lease. 
+ +Field DHCPOFFER DHCPACK DHCPNAK +----- --------- ------- ------- +'op' BOOTREPLY BOOTREPLY BOOTREPLY +'htype' (From "Assigned Numbers" RFC) +'hlen' (Hardware address length in octets) +'hops' 0 0 0 +'xid' 'xid' from client 'xid' from client 'xid' from client + DHCPDISCOVER DHCPREQUEST DHCPREQUEST + message message message +'secs' 0 0 0 +'ciaddr' 0 'ciaddr' from 0 + DHCPREQUEST or 0 +'yiaddr' IP address offered IP address 0 + to client assigned to client +'siaddr' IP address of next IP address of next 0 + bootstrap server bootstrap server +'flags' 'flags' from 'flags' from 'flags' from + client DHCPDISCOVER client DHCPREQUEST client DHCPREQUEST + message message message +'giaddr' 'giaddr' from 'giaddr' from 'giaddr' from + client DHCPDISCOVER client DHCPREQUEST client DHCPREQUEST + message message message +'chaddr' 'chaddr' from 'chaddr' from 'chaddr' from + client DHCPDISCOVER client DHCPREQUEST client DHCPREQUEST + message message message +'sname' Server host name Server host name (unused) + or options or options +'file' Client boot file Client boot file (unused) + name or options name or options +'options' options options + + + +Droms Standards Track [Page 28] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + +Option DHCPOFFER DHCPACK DHCPNAK +------ --------- ------- ------- +Requested IP address MUST NOT MUST NOT MUST NOT +IP address lease time MUST MUST (DHCPREQUEST) MUST NOT + MUST NOT (DHCPINFORM) +Use 'file'/'sname' fields MAY MAY MUST NOT +DHCP message type DHCPOFFER DHCPACK DHCPNAK +Parameter request list MUST NOT MUST NOT MUST NOT +Message SHOULD SHOULD SHOULD +Client identifier MUST NOT MUST NOT MAY +Vendor class identifier MAY MAY MAY +Server identifier MUST MUST MUST +Maximum message size MUST NOT MUST NOT MUST NOT +All others MAY MAY MUST NOT + + Table 3: Fields and options used by DHCP servers + + Once the network address and lease have been determined, the server + constructs a DHCPOFFER message with the offered configuration + 
parameters. It is important for all DHCP servers to return the same + parameters (with the possible exception of a newly allocated network + address) to ensure predictable client behavior regardless of which + server the client selects. The configuration parameters MUST be + selected by applying the following rules in the order given below. + The network administrator is responsible for configuring multiple + DHCP servers to ensure uniform responses from those servers. The + server MUST return to the client: + + o The client's network address, as determined by the rules given + earlier in this section, + + o The expiration time for the client's lease, as determined by the + rules given earlier in this section, + + o Parameters requested by the client, according to the following + rules: + + -- IF the server has been explicitly configured with a default + value for the parameter, the server MUST include that value + in an appropriate option in the 'option' field, ELSE + + -- IF the server recognizes the parameter as a parameter + defined in the Host Requirements Document, the server MUST + include the default value for that parameter as given in the + Host Requirements Document in an appropriate option in the + 'option' field, ELSE + + -- The server MUST NOT return a value for that parameter, + + + +Droms Standards Track [Page 29] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + + The server MUST supply as many of the requested parameters as + possible and MUST omit any parameters it cannot provide. The + server MUST include each requested parameter only once unless + explicitly allowed in the DHCP Options and BOOTP Vendor + Extensions document. 
+ + o Any parameters from the existing binding that differ from the Host + Requirements Document defaults, + + o Any parameters specific to this client (as identified by + the contents of 'chaddr' or 'client identifier' in the DHCPDISCOVER + or DHCPREQUEST message), e.g., as configured by the network + administrator, + + o Any parameters specific to this client's class (as identified + by the contents of the 'vendor class identifier' + option in the DHCPDISCOVER or DHCPREQUEST message), + e.g., as configured by the network administrator; the parameters + MUST be identified by an exact match between the client's vendor + class identifiers and the client's classes identified in the + server, + + o Parameters with non-default values on the client's subnet. + + The server MAY choose to return the 'vendor class identifier' used to + determine the parameters in the DHCPOFFER message to assist the + client in selecting which DHCPOFFER to accept. The server inserts + the 'xid' field from the DHCPDISCOVER message into the 'xid' field of + the DHCPOFFER message and sends the DHCPOFFER message to the + requesting client. + +4.3.2 DHCPREQUEST message + + A DHCPREQUEST message may come from a client responding to a + DHCPOFFER message from a server, from a client verifying a previously + allocated IP address or from a client extending the lease on a + network address. If the DHCPREQUEST message contains a 'server + identifier' option, the message is in response to a DHCPOFFER + message. Otherwise, the message is a request to verify or extend an + existing lease. If the client uses a 'client identifier' in a + DHCPREQUEST message, it MUST use that same 'client identifier' in all + subsequent messages. If the client included a list of requested + parameters in a DHCPDISCOVER message, it MUST include that list in + all subsequent messages. 
+ + + + + + + +Droms Standards Track [Page 30] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + + Any configuration parameters in the DHCPACK message SHOULD NOT + conflict with those in the earlier DHCPOFFER message to which the + client is responding. The client SHOULD use the parameters in the + DHCPACK message for configuration. + + Clients send DHCPREQUEST messages as follows: + + o DHCPREQUEST generated during SELECTING state: + + Client inserts the address of the selected server in 'server + identifier', 'ciaddr' MUST be zero, 'requested IP address' MUST be + filled in with the yiaddr value from the chosen DHCPOFFER. + + Note that the client may choose to collect several DHCPOFFER + messages and select the "best" offer. The client indicates its + selection by identifying the offering server in the DHCPREQUEST + message. If the client receives no acceptable offers, the client + may choose to try another DHCPDISCOVER message. Therefore, the + servers may not receive a specific DHCPREQUEST from which they can + decide whether or not the client has accepted the offer. Because + the servers have not committed any network address assignments on + the basis of a DHCPOFFER, servers are free to reuse offered + network addresses in response to subsequent requests. As an + implementation detail, servers SHOULD NOT reuse offered addresses + and may use an implementation-specific timeout mechanism to decide + when to reuse an offered address. + + o DHCPREQUEST generated during INIT-REBOOT state: + + 'server identifier' MUST NOT be filled in, 'requested IP address' + option MUST be filled in with client's notion of its previously + assigned address. 'ciaddr' MUST be zero. The client is seeking to + verify a previously allocated, cached configuration. Server SHOULD + send a DHCPNAK message to the client if the 'requested IP address' + is incorrect, or is on the wrong network. 
+ + Determining whether a client in the INIT-REBOOT state is on the + correct network is done by examining the contents of 'giaddr', the + 'requested IP address' option, and a database lookup. If the DHCP + server detects that the client is on the wrong net (i.e., the + result of applying the local subnet mask or remote subnet mask (if + 'giaddr' is not zero) to 'requested IP address' option value + doesn't match reality), then the server SHOULD send a DHCPNAK + message to the client. + + + + + + + +Droms Standards Track [Page 31] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + + If the network is correct, then the DHCP server should check if + the client's notion of its IP address is correct. If not, then the + server SHOULD send a DHCPNAK message to the client. If the DHCP + server has no record of this client, then it MUST remain silent, + and MAY output a warning to the network administrator. This + behavior is necessary for peaceful coexistence of non- + communicating DHCP servers on the same wire. + + If 'giaddr' is 0x0 in the DHCPREQUEST message, the client is on + the same subnet as the server. The server MUST broadcast the + DHCPNAK message to the 0xffffffff broadcast address because the + client may not have a correct network address or subnet mask, and + the client may not be answering ARP requests. + + If 'giaddr' is set in the DHCPREQUEST message, the client is on a + different subnet. The server MUST set the broadcast bit in the + DHCPNAK, so that the relay agent will broadcast the DHCPNAK to the + client, because the client may not have a correct network address + or subnet mask, and the client may not be answering ARP requests. + + o DHCPREQUEST generated during RENEWING state: + + 'server identifier' MUST NOT be filled in, 'requested IP address' + option MUST NOT be filled in, 'ciaddr' MUST be filled in with + client's IP address. In this situation, the client is completely + configured, and is trying to extend its lease. 
This message will + be unicast, so no relay agents will be involved in its + transmission. Because 'giaddr' is therefore not filled in, the + DHCP server will trust the value in 'ciaddr', and use it when + replying to the client. + + A client MAY choose to renew or extend its lease prior to T1. The + server may choose not to extend the lease (as a policy decision by + the network administrator), but should return a DHCPACK message + regardless. + + o DHCPREQUEST generated during REBINDING state: + + 'server identifier' MUST NOT be filled in, 'requested IP address' + option MUST NOT be filled in, 'ciaddr' MUST be filled in with + client's IP address. In this situation, the client is completely + configured, and is trying to extend its lease. This message MUST + be broadcast to the 0xffffffff IP broadcast address. The DHCP + server SHOULD check 'ciaddr' for correctness before replying to + the DHCPREQUEST. + + + + + + +Droms Standards Track [Page 32] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + + The DHCPREQUEST from a REBINDING client is intended to accommodate + sites that have multiple DHCP servers and a mechanism for + maintaining consistency among leases managed by multiple servers. + A DHCP server MAY extend a client's lease only if it has local + administrative authority to do so. + +4.3.3 DHCPDECLINE message + + If the server receives a DHCPDECLINE message, the client has + discovered through some other means that the suggested network + address is already in use. The server MUST mark the network address + as not available and SHOULD notify the local system administrator of + a possible configuration problem. + +4.3.4 DHCPRELEASE message + + Upon receipt of a DHCPRELEASE message, the server marks the network + address as not allocated. The server SHOULD retain a record of the + client's initialization parameters for possible reuse in response to + subsequent requests from the client. 
+ +4.3.5 DHCPINFORM message + + The server responds to a DHCPINFORM message by sending a DHCPACK + message directly to the address given in the 'ciaddr' field of the + DHCPINFORM message. The server MUST NOT send a lease expiration time + to the client and SHOULD NOT fill in 'yiaddr'. The server includes + other parameters in the DHCPACK message as defined in section 4.3.1. + +4.3.6 Client messages + + Table 4 details the differences between messages from clients in + various states. + + --------------------------------------------------------------------- + | |INIT-REBOOT |SELECTING |RENEWING |REBINDING | + --------------------------------------------------------------------- + |broad/unicast |broadcast |broadcast |unicast |broadcast | + |server-ip |MUST NOT |MUST |MUST NOT |MUST NOT | + |requested-ip |MUST |MUST |MUST NOT |MUST NOT | + |ciaddr |zero |zero |IP address |IP address| + --------------------------------------------------------------------- + + Table 4: Client messages from different states + + + + + + + +Droms Standards Track [Page 33] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + +4.4 DHCP client behavior + + Figure 5 gives a state-transition diagram for a DHCP client. A + client can receive the following messages from a server: + + o DHCPOFFER + + o DHCPACK + + o DHCPNAK + + The DHCPINFORM message is not shown in figure 5. A client simply + sends the DHCPINFORM and waits for DHCPACK messages. Once the client + has selected its parameters, it has completed the configuration + process. + + Table 5 gives the use of the fields and options in a DHCP message by + a client. The remainder of this section describes the action of the + DHCP client for each possible incoming message. The description in + the following section corresponds to the full configuration procedure + previously described in section 3.1, and the text in the subsequent + section corresponds to the abbreviated configuration procedure + described in section 3.2. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + +Droms Standards Track [Page 34] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + + -------- ------- +| | +-------------------------->| |<-------------------+ +| INIT- | | +-------------------->| INIT | | +| REBOOT |DHCPNAK/ +---------->| |<---+ | +| |Restart| | ------- | | + -------- | DHCPNAK/ | | | + | Discard offer | -/Send DHCPDISCOVER | +-/Send DHCPREQUEST | | | + | | | DHCPACK v | | + ----------- | (not accept.)/ ----------- | | +| | | Send DHCPDECLINE | | | +| REBOOTING | | | | SELECTING |<----+ | +| | | / | | |DHCPOFFER/ | + ----------- | / ----------- | |Collect | + | | / | | | replies | +DHCPACK/ | / +----------------+ +-------+ | +Record lease, set| | v Select offer/ | +timers T1, T2 ------------ send DHCPREQUEST | | + | +----->| | DHCPNAK, Lease expired/ | + | | | REQUESTING | Halt network | + DHCPOFFER/ | | | | + Discard ------------ | | + | | | | ----------- | + | +--------+ DHCPACK/ | | | + | Record lease, set -----| REBINDING | | + | timers T1, T2 / | | | + | | DHCPACK/ ----------- | + | v Record lease, set ^ | + +----------------> ------- /timers T1,T2 | | + +----->| |<---+ | | + | | BOUND |<---+ | | + DHCPOFFER, DHCPACK, | | | T2 expires/ DHCPNAK/ + DHCPNAK/Discard ------- | Broadcast Halt network + | | | | DHCPREQUEST | + +-------+ | DHCPACK/ | | + T1 expires/ Record lease, set | | + Send DHCPREQUEST timers T1, T2 | | + to leasing server | | | + | ---------- | | + | | |------------+ | + +->| RENEWING | | + | |----------------------------+ + ---------- + Figure 5: State-transition diagram for DHCP clients + + + + + + + +Droms Standards Track [Page 35] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + +4.4.1 Initialization and allocation of network address + + The client begins in INIT state and forms a DHCPDISCOVER message. + The client SHOULD wait a random time between one and ten seconds to + desynchronize the use of DHCP at startup. 
The client sets 'ciaddr' + to 0x00000000. The client MAY request specific parameters by + including the 'parameter request list' option. The client MAY + suggest a network address and/or lease time by including the + 'requested IP address' and 'IP address lease time' options. The + client MUST include its hardware address in the 'chaddr' field, if + necessary for delivery of DHCP reply messages. The client MAY + include a different unique identifier in the 'client identifier' + option, as discussed in section 4.2. If the client included a list + of requested parameters in a DHCPDISCOVER message, it MUST include + that list in all subsequent messages. + + The client generates and records a random transaction identifier and + inserts that identifier into the 'xid' field. The client records its + own local time for later use in computing the lease expiration. The + client then broadcasts the DHCPDISCOVER on the local hardware + broadcast address to the 0xffffffff IP broadcast address and 'DHCP + server' UDP port. + + If the 'xid' of an arriving DHCPOFFER message does not match the + 'xid' of the most recent DHCPDISCOVER message, the DHCPOFFER message + must be silently discarded. Any arriving DHCPACK messages must be + silently discarded. + + The client collects DHCPOFFER messages over a period of time, selects + one DHCPOFFER message from the (possibly many) incoming DHCPOFFER + messages (e.g., the first DHCPOFFER message or the DHCPOFFER message + from the previously used server) and extracts the server address from + the 'server identifier' option in the DHCPOFFER message. The time + over which the client collects messages and the mechanism used to + select one DHCPOFFER are implementation dependent. 
+ + + + + + + + + + + + + + + + +Droms Standards Track [Page 36] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + +Field DHCPDISCOVER DHCPREQUEST DHCPDECLINE, + DHCPINFORM DHCPRELEASE +----- ------------ ----------- ----------- +'op' BOOTREQUEST BOOTREQUEST BOOTREQUEST +'htype' (From "Assigned Numbers" RFC) +'hlen' (Hardware address length in octets) +'hops' 0 0 0 +'xid' selected by client 'xid' from server selected by + DHCPOFFER message client +'secs' 0 or seconds since 0 or seconds since 0 + DHCP process started DHCP process started +'flags' Set 'BROADCAST' Set 'BROADCAST' 0 + flag if client flag if client + requires broadcast requires broadcast + reply reply +'ciaddr' 0 (DHCPDISCOVER) 0 or client's 0 (DHCPDECLINE) + client's network address client's network + network address (BOUND/RENEW/REBIND) address + (DHCPINFORM) (DHCPRELEASE) +'yiaddr' 0 0 0 +'siaddr' 0 0 0 +'giaddr' 0 0 0 +'chaddr' client's hardware client's hardware client's hardware + address address address +'sname' options, if options, if (unused) + indicated in indicated in + 'sname/file' 'sname/file' + option; otherwise option; otherwise + unused unused +'file' options, if options, if (unused) + indicated in indicated in + 'sname/file' 'sname/file' + option; otherwise option; otherwise + unused unused +'options' options options (unused) + + + + + + + + + + + + + + + + +Droms Standards Track [Page 37] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + +Option DHCPDISCOVER DHCPREQUEST DHCPDECLINE, + DHCPINFORM DHCPRELEASE +------ ------------ ----------- ----------- +Requested IP address MAY MUST (in MUST + (DISCOVER) SELECTING or (DHCPDECLINE), + MUST NOT INIT-REBOOT) MUST NOT + (INFORM) MUST NOT (in (DHCPRELEASE) + BOUND or + RENEWING) +IP address lease time MAY MAY MUST NOT + (DISCOVER) + MUST NOT + (INFORM) +Use 'file'/'sname' fields MAY MAY MAY +DHCP message type DHCPDISCOVER/ DHCPREQUEST DHCPDECLINE/ + DHCPINFORM DHCPRELEASE +Client identifier MAY MAY MAY +Vendor class 
identifier MAY MAY MUST NOT +Server identifier MUST NOT MUST (after MUST + SELECTING) + MUST NOT (after + INIT-REBOOT, + BOUND, RENEWING + or REBINDING) +Parameter request list MAY MAY MUST NOT +Maximum message size MAY MAY MUST NOT +Message SHOULD NOT SHOULD NOT SHOULD +Site-specific MAY MAY MUST NOT +All others MAY MAY MUST NOT + + Table 5: Fields and options used by DHCP clients + + If the parameters are acceptable, the client records the address of + the server that supplied the parameters from the 'server identifier' + field and sends that address in the 'server identifier' field of a + DHCPREQUEST broadcast message. Once the DHCPACK message from the + server arrives, the client is initialized and moves to BOUND state. + The DHCPREQUEST message contains the same 'xid' as the DHCPOFFER + message. The client records the lease expiration time as the sum of + the time at which the original request was sent and the duration of + the lease from the DHCPACK message. The client SHOULD perform a + check on the suggested address to ensure that the address is not + already in use. For example, if the client is on a network that + supports ARP, the client may issue an ARP request for the suggested + request. When broadcasting an ARP request for the suggested address, + the client must fill in its own hardware address as the sender's + hardware address, and 0 as the sender's IP address, to avoid + confusing ARP caches in other hosts on the same subnet. If the + + + +Droms Standards Track [Page 38] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + + network address appears to be in use, the client MUST send a + DHCPDECLINE message to the server. The client SHOULD broadcast an ARP + reply to announce the client's new IP address and clear any outdated + ARP cache entries in hosts on the client's subnet. + +4.4.2 Initialization with known network address + + The client begins in INIT-REBOOT state and sends a DHCPREQUEST + message. 
The client MUST insert its known network address as a + 'requested IP address' option in the DHCPREQUEST message. The client + may request specific configuration parameters by including the + 'parameter request list' option. The client generates and records a + random transaction identifier and inserts that identifier into the + 'xid' field. The client records its own local time for later use in + computing the lease expiration. The client MUST NOT include a + 'server identifier' in the DHCPREQUEST message. The client then + broadcasts the DHCPREQUEST on the local hardware broadcast address to + the 'DHCP server' UDP port. + + Once a DHCPACK message with an 'xid' field matching that in the + client's DHCPREQUEST message arrives from any server, the client is + initialized and moves to BOUND state. The client records the lease + expiration time as the sum of the time at which the DHCPREQUEST + message was sent and the duration of the lease from the DHCPACK + message. + +4.4.3 Initialization with an externally assigned network address + + The client sends a DHCPINFORM message. The client may request + specific configuration parameters by including the 'parameter request + list' option. The client generates and records a random transaction + identifier and inserts that identifier into the 'xid' field. The + client places its own network address in the 'ciaddr' field. The + client SHOULD NOT request lease time parameters. + + The client then unicasts the DHCPINFORM to the DHCP server if it + knows the server's address, otherwise it broadcasts the message to + the limited (all 1s) broadcast address. DHCPINFORM messages MUST be + directed to the 'DHCP server' UDP port. + + Once a DHCPACK message with an 'xid' field matching that in the + client's DHCPINFORM message arrives from any server, the client is + initialized. 
+ + If the client does not receive a DHCPACK within a reasonable period + of time (60 seconds or 4 tries if using timeout suggested in section + 4.1), then it SHOULD display a message informing the user of the + problem, and then SHOULD begin network processing using suitable + + + +Droms Standards Track [Page 39] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + + defaults as per Appendix A. + +4.4.4 Use of broadcast and unicast + + The DHCP client broadcasts DHCPDISCOVER, DHCPREQUEST and DHCPINFORM + messages, unless the client knows the address of a DHCP server. The + client unicasts DHCPRELEASE messages to the server. Because the + client is declining the use of the IP address supplied by the server, + the client broadcasts DHCPDECLINE messages. + + When the DHCP client knows the address of a DHCP server, in either + INIT or REBOOTING state, the client may use that address in the + DHCPDISCOVER or DHCPREQUEST rather than the IP broadcast address. + The client may also use unicast to send DHCPINFORM messages to a + known DHCP server. If the client receives no response to DHCP + messages sent to the IP address of a known DHCP server, the DHCP + client reverts to using the IP broadcast address. + +4.4.5 Reacquisition and expiration + + The client maintains two times, T1 and T2, that specify the times at + which the client tries to extend its lease on its network address. + T1 is the time at which the client enters the RENEWING state and + attempts to contact the server that originally issued the client's + network address. T2 is the time at which the client enters the + REBINDING state and attempts to contact any server. T1 MUST be + earlier than T2, which, in turn, MUST be earlier than the time at + which the client's lease will expire. + + To avoid the need for synchronized clocks, T1 and T2 are expressed in + options as relative times [2]. 
+ + At time T1 the client moves to RENEWING state and sends (via unicast) + a DHCPREQUEST message to the server to extend its lease. The client + sets the 'ciaddr' field in the DHCPREQUEST to its current network + address. The client records the local time at which the DHCPREQUEST + message is sent for computation of the lease expiration time. The + client MUST NOT include a 'server identifier' in the DHCPREQUEST + message. + + Any DHCPACK messages that arrive with an 'xid' that does not match + the 'xid' of the client's DHCPREQUEST message are silently discarded. + When the client receives a DHCPACK from the server, the client + computes the lease expiration time as the sum of the time at which + the client sent the DHCPREQUEST message and the duration of the lease + in the DHCPACK message. The client has successfully reacquired its + network address, returns to BOUND state and may continue network + processing. + + + +Droms Standards Track [Page 40] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + + If no DHCPACK arrives before time T2, the client moves to REBINDING + state and sends (via broadcast) a DHCPREQUEST message to extend its + lease. The client sets the 'ciaddr' field in the DHCPREQUEST to its + current network address. The client MUST NOT include a 'server + identifier' in the DHCPREQUEST message. + + Times T1 and T2 are configurable by the server through options. T1 + defaults to (0.5 * duration_of_lease). T2 defaults to (0.875 * + duration_of_lease). Times T1 and T2 SHOULD be chosen with some + random "fuzz" around a fixed value, to avoid synchronization of + client reacquisition. + + A client MAY choose to renew or extend its lease prior to T1. The + server MAY choose to extend the client's lease according to policy + set by the network administrator. The server SHOULD return T1 and + T2, and their values SHOULD be adjusted from their original values to + take account of the time remaining on the lease. 
+ + In both RENEWING and REBINDING states, if the client receives no + response to its DHCPREQUEST message, the client SHOULD wait one-half + of the remaining time until T2 (in RENEWING state) and one-half of + the remaining lease time (in REBINDING state), down to a minimum of + 60 seconds, before retransmitting the DHCPREQUEST message. + + If the lease expires before the client receives a DHCPACK, the client + moves to INIT state, MUST immediately stop any other network + processing and requests network initialization parameters as if the + client were uninitialized. If the client then receives a DHCPACK + allocating that client its previous network address, the client + SHOULD continue network processing. If the client is given a new + network address, it MUST NOT continue using the previous network + address and SHOULD notify the local users of the problem. + +4.4.6 DHCPRELEASE + + If the client no longer requires use of its assigned network address + (e.g., the client is gracefully shut down), the client sends a + DHCPRELEASE message to the server. Note that the correct operation + of DHCP does not depend on the transmission of DHCPRELEASE messages. + + + + + + + + + + + + +Droms Standards Track [Page 41] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + +5. Acknowledgments + + The author thanks the many (and too numerous to mention!) members of + the DHC WG for their tireless and ongoing efforts in the development + of DHCP and this document. + + The efforts of J Allard, Mike Carney, Dave Lapp, Fred Lien and John + Mendonca in organizing DHCP interoperability testing sessions are + gratefully acknowledged. + + The development of this document was supported in part by grants from + the Corporation for National Research Initiatives (CNRI), Bucknell + University and Sun Microsystems. + +6. References + + [1] Acetta, M., "Resource Location Protocol", RFC 887, CMU, December + 1983. + + [2] Alexander, S., and R. 
Droms, "DHCP Options and BOOTP Vendor + Extensions", RFC 1533, Lachman Technology, Inc., Bucknell + University, October 1993. + + [3] Braden, R., Editor, "Requirements for Internet Hosts -- + Communication Layers", STD 3, RFC 1122, USC/Information Sciences + Institute, October 1989. + + [4] Braden, R., Editor, "Requirements for Internet Hosts -- + Application and Support, STD 3, RFC 1123, USC/Information + Sciences Institute, October 1989. + + [5] Brownell, D, "Dynamic Reverse Address Resolution Protocol + (DRARP)", Work in Progress. + + [6] Comer, D., and R. Droms, "Uniform Access to Internet Directory + Services", Proc. of ACM SIGCOMM '90 (Special issue of Computer + Communications Review), 20(4):50--59, 1990. + + [7] Croft, B., and J. Gilmore, "Bootstrap Protocol (BOOTP)", RFC 951, + Stanford and SUN Microsystems, September 1985. + + [8] Deering, S., "ICMP Router Discovery Messages", RFC 1256, Xerox + PARC, September 1991. + + [9] Droms, D., "Interoperation between DHCP and BOOTP", RFC 1534, + Bucknell University, October 1993. + + + + + +Droms Standards Track [Page 42] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + + [10] Finlayson, R., Mann, T., Mogul, J., and M. Theimer, "A Reverse + Address Resolution Protocol", RFC 903, Stanford, June 1984. + + [11] Gray C., and D. Cheriton, "Leases: An Efficient Fault-Tolerant + Mechanism for Distributed File Cache Consistency", In Proc. of + the Twelfth ACM Symposium on Operating Systems Design, 1989. + + [12] Mockapetris, P., "Domain Names -- Concepts and Facilities", STD + 13, RFC 1034, USC/Information Sciences Institute, November 1987. + + [13] Mockapetris, P., "Domain Names -- Implementation and + Specification", STD 13, RFC 1035, USC/Information Sciences + Institute, November 1987. + + [14] Mogul J., and S. Deering, "Path MTU Discovery", RFC 1191, + November 1990. + + [15] Morgan, R., "Dynamic IP Address Assignment for Ethernet Attached + Hosts", Work in Progress. 
+ + [16] Postel, J., "Internet Control Message Protocol", STD 5, RFC 792, + USC/Information Sciences Institute, September 1981. + + [17] Reynolds, J., "BOOTP Vendor Information Extensions", RFC 1497, + USC/Information Sciences Institute, August 1993. + + [18] Reynolds, J., and J. Postel, "Assigned Numbers", STD 2, RFC 1700, + USC/Information Sciences Institute, October 1994. + + [19] Jeffrey Schiller and Mark Rosenstein. A Protocol for the Dynamic + Assignment of IP Addresses for use on an Ethernet. (Available + from the Athena Project, MIT), 1989. + + [20] Sollins, K., "The TFTP Protocol (Revision 2)", RFC 783, NIC, + June 1981. + + [21] Wimer, W., "Clarifications and Extensions for the Bootstrap + Protocol", RFC 1542, Carnegie Mellon University, October 1993. + +7. Security Considerations + + DHCP is built directly on UDP and IP which are as yet inherently + insecure. Furthermore, DHCP is generally intended to make + maintenance of remote and/or diskless hosts easier. While perhaps + not impossible, configuring such hosts with passwords or keys may be + difficult and inconvenient. Therefore, DHCP in its current form is + quite insecure. + + + + +Droms Standards Track [Page 43] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + + Unauthorized DHCP servers may be easily set up. Such servers can + then send false and potentially disruptive information to clients + such as incorrect or duplicate IP addresses, incorrect routing + information (including spoof routers, etc.), incorrect domain + nameserver addresses (such as spoof nameservers), and so on. + Clearly, once this seed information is in place, an attacker can + further compromise affected systems. + + Malicious DHCP clients could masquerade as legitimate clients and + retrieve information intended for those legitimate clients. Where + dynamic allocation of resources is used, a malicious client could + claim all resources for itself, thereby denying resources to + legitimate clients. + +8. 
Author's Address + + Ralph Droms + Computer Science Department + 323 Dana Engineering + Bucknell University + Lewisburg, PA 17837 + + Phone: (717) 524-1145 + EMail: droms@bucknell.edu + + + + + + + + + + + + + + + + + + + + + + + + + + + +Droms Standards Track [Page 44] + +RFC 2131 Dynamic Host Configuration Protocol March 1997 + + +A. Host Configuration Parameters + + IP-layer_parameters,_per_host:_ + + Be a router on/off HRC 3.1 + Non-local source routing on/off HRC 3.3.5 + Policy filters for + non-local source routing (list) HRC 3.3.5 + Maximum reassembly size integer HRC 3.3.2 + Default TTL integer HRC 3.2.1.7 + PMTU aging timeout integer MTU 6.6 + MTU plateau table (list) MTU 7 + IP-layer_parameters,_per_interface:_ + IP address (address) HRC 3.3.1.6 + Subnet mask (address mask) HRC 3.3.1.6 + MTU integer HRC 3.3.3 + All-subnets-MTU on/off HRC 3.3.3 + Broadcast address flavor 0x00000000/0xffffffff HRC 3.3.6 + Perform mask discovery on/off HRC 3.2.2.9 + Be a mask supplier on/off HRC 3.2.2.9 + Perform router discovery on/off RD 5.1 + Router solicitation address (address) RD 5.1 + Default routers, list of: + router address (address) HRC 3.3.1.6 + preference level integer HRC 3.3.1.6 + Static routes, list of: + destination (host/subnet/net) HRC 3.3.1.2 + destination mask (address mask) HRC 3.3.1.2 + type-of-service integer HRC 3.3.1.2 + first-hop router (address) HRC 3.3.1.2 + ignore redirects on/off HRC 3.3.1.2 + PMTU integer MTU 6.6 + perform PMTU discovery on/off MTU 6.6 + + Link-layer_parameters,_per_interface:_ + Trailers on/off HRC 2.3.1 + ARP cache timeout integer HRC 2.3.2.1 + Ethernet encapsulation (RFC 894/RFC 1042) HRC 2.3.3 + + TCP_parameters,_per_host:_ + TTL integer HRC 4.2.2.19 + Keep-alive interval integer HRC 4.2.3.6 + Keep-alive data size 0/1 HRC 4.2.3.6 + +Key: + + MTU = Path MTU Discovery (RFC 1191, Proposed Standard) + RD = Router Discovery (RFC 1256, Proposed Standard) + + + +Droms Standards Track [Page 45] + diff --git 
a/ext/picotcp/RFC/rfc2460.txt b/ext/picotcp/RFC/rfc2460.txt new file mode 100644 index 0000000..de7b7fa --- /dev/null +++ b/ext/picotcp/RFC/rfc2460.txt @@ -0,0 +1,2187 @@ + + + + + + +Network Working Group S. Deering +Request for Comments: 2460 Cisco +Obsoletes: 1883 R. Hinden +Category: Standards Track Nokia + December 1998 + + + Internet Protocol, Version 6 (IPv6) + Specification + +Status of this Memo + + This document specifies an Internet standards track protocol for the + Internet community, and requests discussion and suggestions for + improvements. Please refer to the current edition of the "Internet + Official Protocol Standards" (STD 1) for the standardization state + and status of this protocol. Distribution of this memo is unlimited. + +Copyright Notice + + Copyright (C) The Internet Society (1998). All Rights Reserved. + +Abstract + + This document specifies version 6 of the Internet Protocol (IPv6), + also sometimes referred to as IP Next Generation or IPng. + +Table of Contents + + 1. Introduction..................................................2 + 2. Terminology...................................................3 + 3. IPv6 Header Format............................................4 + 4. IPv6 Extension Headers........................................6 + 4.1 Extension Header Order...................................7 + 4.2 Options..................................................9 + 4.3 Hop-by-Hop Options Header...............................11 + 4.4 Routing Header..........................................12 + 4.5 Fragment Header.........................................18 + 4.6 Destination Options Header..............................23 + 4.7 No Next Header..........................................24 + 5. Packet Size Issues...........................................24 + 6. Flow Labels..................................................25 + 7. Traffic Classes..............................................25 + 8. 
Upper-Layer Protocol Issues..................................27 + 8.1 Upper-Layer Checksums...................................27 + 8.2 Maximum Packet Lifetime.................................28 + 8.3 Maximum Upper-Layer Payload Size........................28 + 8.4 Responding to Packets Carrying Routing Headers..........29 + + + +Deering & Hinden Standards Track [Page 1] + +RFC 2460 IPv6 Specification December 1998 + + + Appendix A. Semantics and Usage of the Flow Label Field.........30 + Appendix B. Formatting Guidelines for Options...................32 + Security Considerations.........................................35 + Acknowledgments.................................................35 + Authors' Addresses..............................................35 + References......................................................35 + Changes Since RFC-1883..........................................36 + Full Copyright Statement........................................39 + +1. Introduction + + IP version 6 (IPv6) is a new version of the Internet Protocol, + designed as the successor to IP version 4 (IPv4) [RFC-791]. The + changes from IPv4 to IPv6 fall primarily into the following + categories: + + o Expanded Addressing Capabilities + + IPv6 increases the IP address size from 32 bits to 128 bits, to + support more levels of addressing hierarchy, a much greater + number of addressable nodes, and simpler auto-configuration of + addresses. The scalability of multicast routing is improved by + adding a "scope" field to multicast addresses. And a new type + of address called an "anycast address" is defined, used to send + a packet to any one of a group of nodes. + + o Header Format Simplification + + Some IPv4 header fields have been dropped or made optional, to + reduce the common-case processing cost of packet handling and + to limit the bandwidth cost of the IPv6 header. 
+ + o Improved Support for Extensions and Options + + Changes in the way IP header options are encoded allows for + more efficient forwarding, less stringent limits on the length + of options, and greater flexibility for introducing new options + in the future. + + o Flow Labeling Capability + + A new capability is added to enable the labeling of packets + belonging to particular traffic "flows" for which the sender + requests special handling, such as non-default quality of + service or "real-time" service. + + + + + + +Deering & Hinden Standards Track [Page 2] + +RFC 2460 IPv6 Specification December 1998 + + + o Authentication and Privacy Capabilities + + Extensions to support authentication, data integrity, and + (optional) data confidentiality are specified for IPv6. + + This document specifies the basic IPv6 header and the initially- + defined IPv6 extension headers and options. It also discusses packet + size issues, the semantics of flow labels and traffic classes, and + the effects of IPv6 on upper-layer protocols. The format and + semantics of IPv6 addresses are specified separately in [ADDRARCH]. + The IPv6 version of ICMP, which all IPv6 implementations are required + to include, is specified in [ICMPv6]. + +2. Terminology + + node - a device that implements IPv6. + + router - a node that forwards IPv6 packets not explicitly + addressed to itself. [See Note below]. + + host - any node that is not a router. [See Note below]. + + upper layer - a protocol layer immediately above IPv6. Examples are + transport protocols such as TCP and UDP, control + protocols such as ICMP, routing protocols such as OSPF, + and internet or lower-layer protocols being "tunneled" + over (i.e., encapsulated in) IPv6 such as IPX, + AppleTalk, or IPv6 itself. + + link - a communication facility or medium over which nodes can + communicate at the link layer, i.e., the layer + immediately below IPv6. 
Examples are Ethernets (simple + or bridged); PPP links; X.25, Frame Relay, or ATM + networks; and internet (or higher) layer "tunnels", + such as tunnels over IPv4 or IPv6 itself. + + neighbors - nodes attached to the same link. + + interface - a node's attachment to a link. + + address - an IPv6-layer identifier for an interface or a set of + interfaces. + + packet - an IPv6 header plus payload. + + link MTU - the maximum transmission unit, i.e., maximum packet + size in octets, that can be conveyed over a link. + + + + +Deering & Hinden Standards Track [Page 3] + +RFC 2460 IPv6 Specification December 1998 + + + path MTU - the minimum link MTU of all the links in a path between + a source node and a destination node. + + Note: it is possible, though unusual, for a device with multiple + interfaces to be configured to forward non-self-destined packets + arriving from some set (fewer than all) of its interfaces, and to + discard non-self-destined packets arriving from its other interfaces. + Such a device must obey the protocol requirements for routers when + receiving packets from, and interacting with neighbors over, the + former (forwarding) interfaces. It must obey the protocol + requirements for hosts when receiving packets from, and interacting + with neighbors over, the latter (non-forwarding) interfaces. + +3. IPv6 Header Format + + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + |Version| Traffic Class | Flow Label | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Payload Length | Next Header | Hop Limit | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + + + + | | + + Source Address + + | | + + + + | | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + + + + | | + + Destination Address + + | | + + + + | | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + Version 4-bit Internet Protocol version number = 6. 
+ + Traffic Class 8-bit traffic class field. See section 7. + + Flow Label 20-bit flow label. See section 6. + + Payload Length 16-bit unsigned integer. Length of the IPv6 + payload, i.e., the rest of the packet following + this IPv6 header, in octets. (Note that any + + + + + +Deering & Hinden Standards Track [Page 4] + +RFC 2460 IPv6 Specification December 1998 + + + extension headers [section 4] present are + considered part of the payload, i.e., included + in the length count.) + + Next Header 8-bit selector. Identifies the type of header + immediately following the IPv6 header. Uses the + same values as the IPv4 Protocol field [RFC-1700 + et seq.]. + + Hop Limit 8-bit unsigned integer. Decremented by 1 by + each node that forwards the packet. The packet + is discarded if Hop Limit is decremented to + zero. + + Source Address 128-bit address of the originator of the packet. + See [ADDRARCH]. + + Destination Address 128-bit address of the intended recipient of the + packet (possibly not the ultimate recipient, if + a Routing header is present). See [ADDRARCH] + and section 4.4. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Deering & Hinden Standards Track [Page 5] + +RFC 2460 IPv6 Specification December 1998 + + +4. IPv6 Extension Headers + + In IPv6, optional internet-layer information is encoded in separate + headers that may be placed between the IPv6 header and the upper- + layer header in a packet. There are a small number of such extension + headers, each identified by a distinct Next Header value. 
As + illustrated in these examples, an IPv6 packet may carry zero, one, or + more extension headers, each identified by the Next Header field of + the preceding header: + + +---------------+------------------------ + | IPv6 header | TCP header + data + | | + | Next Header = | + | TCP | + +---------------+------------------------ + + + +---------------+----------------+------------------------ + | IPv6 header | Routing header | TCP header + data + | | | + | Next Header = | Next Header = | + | Routing | TCP | + +---------------+----------------+------------------------ + + + +---------------+----------------+-----------------+----------------- + | IPv6 header | Routing header | Fragment header | fragment of TCP + | | | | header + data + | Next Header = | Next Header = | Next Header = | + | Routing | Fragment | TCP | + +---------------+----------------+-----------------+----------------- + + With one exception, extension headers are not examined or processed + by any node along a packet's delivery path, until the packet reaches + the node (or each of the set of nodes, in the case of multicast) + identified in the Destination Address field of the IPv6 header. + There, normal demultiplexing on the Next Header field of the IPv6 + header invokes the module to process the first extension header, or + the upper-layer header if no extension header is present. The + contents and semantics of each extension header determine whether or + not to proceed to the next header. Therefore, extension headers must + be processed strictly in the order they appear in the packet; a + receiver must not, for example, scan through a packet looking for a + particular kind of extension header and process that header prior to + processing all preceding ones. 
+ + + + + +Deering & Hinden Standards Track [Page 6] + +RFC 2460 IPv6 Specification December 1998 + + + The exception referred to in the preceding paragraph is the Hop-by- + Hop Options header, which carries information that must be examined + and processed by every node along a packet's delivery path, including + the source and destination nodes. The Hop-by-Hop Options header, + when present, must immediately follow the IPv6 header. Its presence + is indicated by the value zero in the Next Header field of the IPv6 + header. + + If, as a result of processing a header, a node is required to proceed + to the next header but the Next Header value in the current header is + unrecognized by the node, it should discard the packet and send an + ICMP Parameter Problem message to the source of the packet, with an + ICMP Code value of 1 ("unrecognized Next Header type encountered") + and the ICMP Pointer field containing the offset of the unrecognized + value within the original packet. The same action should be taken if + a node encounters a Next Header value of zero in any header other + than an IPv6 header. + + Each extension header is an integer multiple of 8 octets long, in + order to retain 8-octet alignment for subsequent headers. Multi- + octet fields within each extension header are aligned on their + natural boundaries, i.e., fields of width n octets are placed at an + integer multiple of n octets from the start of the header, for n = 1, + 2, 4, or 8. + + A full implementation of IPv6 includes implementation of the + following extension headers: + + Hop-by-Hop Options + Routing (Type 0) + Fragment + Destination Options + Authentication + Encapsulating Security Payload + + The first four are specified in this document; the last two are + specified in [RFC-2402] and [RFC-2406], respectively. 
+ +4.1 Extension Header Order + + When more than one extension header is used in the same packet, it is + recommended that those headers appear in the following order: + + IPv6 header + Hop-by-Hop Options header + Destination Options header (note 1) + Routing header + Fragment header + + + +Deering & Hinden Standards Track [Page 7] + +RFC 2460 IPv6 Specification December 1998 + + + Authentication header (note 2) + Encapsulating Security Payload header (note 2) + Destination Options header (note 3) + upper-layer header + + note 1: for options to be processed by the first destination + that appears in the IPv6 Destination Address field + plus subsequent destinations listed in the Routing + header. + + note 2: additional recommendations regarding the relative + order of the Authentication and Encapsulating + Security Payload headers are given in [RFC-2406]. + + note 3: for options to be processed only by the final + destination of the packet. + + Each extension header should occur at most once, except for the + Destination Options header which should occur at most twice (once + before a Routing header and once before the upper-layer header). + + If the upper-layer header is another IPv6 header (in the case of IPv6 + being tunneled over or encapsulated in IPv6), it may be followed by + its own extension headers, which are separately subject to the same + ordering recommendations. + + If and when other extension headers are defined, their ordering + constraints relative to the above listed headers must be specified. + + IPv6 nodes must accept and attempt to process extension headers in + any order and occurring any number of times in the same packet, + except for the Hop-by-Hop Options header which is restricted to + appear immediately after an IPv6 header only. Nonetheless, it is + strongly advised that sources of IPv6 packets adhere to the above + recommended order until and unless subsequent specifications revise + that recommendation. 
+ + + + + + + + + + + + + + + +Deering & Hinden Standards Track [Page 8] + +RFC 2460 IPv6 Specification December 1998 + + +4.2 Options + + Two of the currently-defined extension headers -- the Hop-by-Hop + Options header and the Destination Options header -- carry a variable + number of type-length-value (TLV) encoded "options", of the following + format: + + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+- - - - - - - - - + | Option Type | Opt Data Len | Option Data + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+- - - - - - - - - + + Option Type 8-bit identifier of the type of option. + + Opt Data Len 8-bit unsigned integer. Length of the Option + Data field of this option, in octets. + + Option Data Variable-length field. Option-Type-specific + data. + + The sequence of options within a header must be processed strictly in + the order they appear in the header; a receiver must not, for + example, scan through the header looking for a particular kind of + option and process that option prior to processing all preceding + ones. + + The Option Type identifiers are internally encoded such that their + highest-order two bits specify the action that must be taken if the + processing IPv6 node does not recognize the Option Type: + + 00 - skip over this option and continue processing the header. + + 01 - discard the packet. + + 10 - discard the packet and, regardless of whether or not the + packet's Destination Address was a multicast address, send an + ICMP Parameter Problem, Code 2, message to the packet's + Source Address, pointing to the unrecognized Option Type. + + 11 - discard the packet and, only if the packet's Destination + Address was not a multicast address, send an ICMP Parameter + Problem, Code 2, message to the packet's Source Address, + pointing to the unrecognized Option Type. + + The third-highest-order bit of the Option Type specifies whether or + not the Option Data of that option can change en-route to the + packet's final destination. 
When an Authentication header is present + + + + + +Deering & Hinden Standards Track [Page 9] + +RFC 2460 IPv6 Specification December 1998 + + + in the packet, for any option whose data may change en-route, its + entire Option Data field must be treated as zero-valued octets when + computing or verifying the packet's authenticating value. + + 0 - Option Data does not change en-route + + 1 - Option Data may change en-route + + The three high-order bits described above are to be treated as part + of the Option Type, not independent of the Option Type. That is, a + particular option is identified by a full 8-bit Option Type, not just + the low-order 5 bits of an Option Type. + + The same Option Type numbering space is used for both the Hop-by-Hop + Options header and the Destination Options header. However, the + specification of a particular option may restrict its use to only one + of those two headers. + + Individual options may have specific alignment requirements, to + ensure that multi-octet values within Option Data fields fall on + natural boundaries. The alignment requirement of an option is + specified using the notation xn+y, meaning the Option Type must + appear at an integer multiple of x octets from the start of the + header, plus y octets. For example: + + 2n means any 2-octet offset from the start of the header. + 8n+2 means any 8-octet offset from the start of the header, + plus 2 octets. + + There are two padding options which are used when necessary to align + subsequent options and to pad out the containing header to a multiple + of 8 octets in length. These padding options must be recognized by + all IPv6 implementations: + + Pad1 option (alignment requirement: none) + + +-+-+-+-+-+-+-+-+ + | 0 | + +-+-+-+-+-+-+-+-+ + + NOTE! the format of the Pad1 option is a special case -- it does + not have length and value fields. + + The Pad1 option is used to insert one octet of padding into the + Options area of a header. 
If more than one octet of padding is + required, the PadN option, described next, should be used, rather + than multiple Pad1 options. + + + + +Deering & Hinden Standards Track [Page 10] + +RFC 2460 IPv6 Specification December 1998 + + + PadN option (alignment requirement: none) + + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+- - - - - - - - - + | 1 | Opt Data Len | Option Data + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+- - - - - - - - - + + The PadN option is used to insert two or more octets of padding + into the Options area of a header. For N octets of padding, the + Opt Data Len field contains the value N-2, and the Option Data + consists of N-2 zero-valued octets. + + Appendix B contains formatting guidelines for designing new options. + +4.3 Hop-by-Hop Options Header + + The Hop-by-Hop Options header is used to carry optional information + that must be examined by every node along a packet's delivery path. + The Hop-by-Hop Options header is identified by a Next Header value of + 0 in the IPv6 header, and has the following format: + + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Next Header | Hdr Ext Len | | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + | | + . . + . Options . + . . + | | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + Next Header 8-bit selector. Identifies the type of header + immediately following the Hop-by-Hop Options + header. Uses the same values as the IPv4 + Protocol field [RFC-1700 et seq.]. + + Hdr Ext Len 8-bit unsigned integer. Length of the Hop-by- + Hop Options header in 8-octet units, not + including the first 8 octets. + + Options Variable-length field, of length such that the + complete Hop-by-Hop Options header is an integer + multiple of 8 octets long. Contains one or more + TLV-encoded options, as described in section + 4.2. + + The only hop-by-hop options defined in this document are the Pad1 and + PadN options specified in section 4.2. 
+ + + + +Deering & Hinden Standards Track [Page 11] + +RFC 2460 IPv6 Specification December 1998 + + +4.4 Routing Header + + The Routing header is used by an IPv6 source to list one or more + intermediate nodes to be "visited" on the way to a packet's + destination. This function is very similar to IPv4's Loose Source + and Record Route option. The Routing header is identified by a Next + Header value of 43 in the immediately preceding header, and has the + following format: + + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Next Header | Hdr Ext Len | Routing Type | Segments Left | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + . . + . type-specific data . + . . + | | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + Next Header 8-bit selector. Identifies the type of header + immediately following the Routing header. Uses + the same values as the IPv4 Protocol field + [RFC-1700 et seq.]. + + Hdr Ext Len 8-bit unsigned integer. Length of the Routing + header in 8-octet units, not including the first + 8 octets. + + Routing Type 8-bit identifier of a particular Routing header + variant. + + Segments Left 8-bit unsigned integer. Number of route + segments remaining, i.e., number of explicitly + listed intermediate nodes still to be visited + before reaching the final destination. + + type-specific data Variable-length field, of format determined by + the Routing Type, and of length such that the + complete Routing header is an integer multiple + of 8 octets long. 
+ + If, while processing a received packet, a node encounters a Routing + header with an unrecognized Routing Type value, the required behavior + of the node depends on the value of the Segments Left field, as + follows: + + + + + + +Deering & Hinden Standards Track [Page 12] + +RFC 2460 IPv6 Specification December 1998 + + + If Segments Left is zero, the node must ignore the Routing header + and proceed to process the next header in the packet, whose type + is identified by the Next Header field in the Routing header. + + If Segments Left is non-zero, the node must discard the packet and + send an ICMP Parameter Problem, Code 0, message to the packet's + Source Address, pointing to the unrecognized Routing Type. + + If, after processing a Routing header of a received packet, an + intermediate node determines that the packet is to be forwarded onto + a link whose link MTU is less than the size of the packet, the node + must discard the packet and send an ICMP Packet Too Big message to + the packet's Source Address. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Deering & Hinden Standards Track [Page 13] + +RFC 2460 IPv6 Specification December 1998 + + + The Type 0 Routing header has the following format: + + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Next Header | Hdr Ext Len | Routing Type=0| Segments Left | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Reserved | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + + + + | | + + Address[1] + + | | + + + + | | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + + + + | | + + Address[2] + + | | + + + + | | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + . . . + . . . + . . . 
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + + + + | | + + Address[n] + + | | + + + + | | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + Next Header 8-bit selector. Identifies the type of header + immediately following the Routing header. Uses + the same values as the IPv4 Protocol field + [RFC-1700 et seq.]. + + Hdr Ext Len 8-bit unsigned integer. Length of the Routing + header in 8-octet units, not including the first + 8 octets. For the Type 0 Routing header, Hdr + Ext Len is equal to two times the number of + addresses in the header. + + Routing Type 0. + + + +Deering & Hinden Standards Track [Page 14] + +RFC 2460 IPv6 Specification December 1998 + + + Segments Left 8-bit unsigned integer. Number of route + segments remaining, i.e., number of explicitly + listed intermediate nodes still to be visited + before reaching the final destination. + + Reserved 32-bit reserved field. Initialized to zero for + transmission; ignored on reception. + + Address[1..n] Vector of 128-bit addresses, numbered 1 to n. + + Multicast addresses must not appear in a Routing header of Type 0, or + in the IPv6 Destination Address field of a packet carrying a Routing + header of Type 0. + + A Routing header is not examined or processed until it reaches the + node identified in the Destination Address field of the IPv6 header. 
+ In that node, dispatching on the Next Header field of the immediately + preceding header causes the Routing header module to be invoked, + which, in the case of Routing Type 0, performs the following + algorithm: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Deering & Hinden Standards Track [Page 15] + +RFC 2460 IPv6 Specification December 1998 + + + if Segments Left = 0 { + proceed to process the next header in the packet, whose type is + identified by the Next Header field in the Routing header + } + else if Hdr Ext Len is odd { + send an ICMP Parameter Problem, Code 0, message to the Source + Address, pointing to the Hdr Ext Len field, and discard the + packet + } + else { + compute n, the number of addresses in the Routing header, by + dividing Hdr Ext Len by 2 + + if Segments Left is greater than n { + send an ICMP Parameter Problem, Code 0, message to the Source + Address, pointing to the Segments Left field, and discard the + packet + } + else { + decrement Segments Left by 1; + compute i, the index of the next address to be visited in + the address vector, by subtracting Segments Left from n + + if Address [i] or the IPv6 Destination Address is multicast { + discard the packet + } + else { + swap the IPv6 Destination Address and Address[i] + + if the IPv6 Hop Limit is less than or equal to 1 { + send an ICMP Time Exceeded -- Hop Limit Exceeded in + Transit message to the Source Address and discard the + packet + } + else { + decrement the Hop Limit by 1 + + resubmit the packet to the IPv6 module for transmission + to the new destination + } + } + } + } + + + + + + + + +Deering & Hinden Standards Track [Page 16] + +RFC 2460 IPv6 Specification December 1998 + + + As an example of the effects of the above algorithm, consider the + case of a source node S sending a packet to destination node D, using + a Routing header to cause the packet to be routed via intermediate + nodes I1, I2, and I3. 
The values of the relevant IPv6 header and + Routing header fields on each segment of the delivery path would be + as follows: + + As the packet travels from S to I1: + + Source Address = S Hdr Ext Len = 6 + Destination Address = I1 Segments Left = 3 + Address[1] = I2 + Address[2] = I3 + Address[3] = D + + As the packet travels from I1 to I2: + + Source Address = S Hdr Ext Len = 6 + Destination Address = I2 Segments Left = 2 + Address[1] = I1 + Address[2] = I3 + Address[3] = D + + As the packet travels from I2 to I3: + + Source Address = S Hdr Ext Len = 6 + Destination Address = I3 Segments Left = 1 + Address[1] = I1 + Address[2] = I2 + Address[3] = D + + As the packet travels from I3 to D: + + Source Address = S Hdr Ext Len = 6 + Destination Address = D Segments Left = 0 + Address[1] = I1 + Address[2] = I2 + Address[3] = I3 + + + + + + + + + + + + + +Deering & Hinden Standards Track [Page 17] + +RFC 2460 IPv6 Specification December 1998 + + +4.5 Fragment Header + + The Fragment header is used by an IPv6 source to send a packet larger + than would fit in the path MTU to its destination. (Note: unlike + IPv4, fragmentation in IPv6 is performed only by source nodes, not by + routers along a packet's delivery path -- see section 5.) The + Fragment header is identified by a Next Header value of 44 in the + immediately preceding header, and has the following format: + + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Next Header | Reserved | Fragment Offset |Res|M| + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Identification | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + Next Header 8-bit selector. Identifies the initial header + type of the Fragmentable Part of the original + packet (defined below). Uses the same values as + the IPv4 Protocol field [RFC-1700 et seq.]. + + Reserved 8-bit reserved field. Initialized to zero for + transmission; ignored on reception. 
+ + Fragment Offset 13-bit unsigned integer. The offset, in 8-octet + units, of the data following this header, + relative to the start of the Fragmentable Part + of the original packet. + + Res 2-bit reserved field. Initialized to zero for + transmission; ignored on reception. + + M flag 1 = more fragments; 0 = last fragment. + + Identification 32 bits. See description below. + + In order to send a packet that is too large to fit in the MTU of the + path to its destination, a source node may divide the packet into + fragments and send each fragment as a separate packet, to be + reassembled at the receiver. + + For every packet that is to be fragmented, the source node generates + an Identification value. The Identification must be different than + that of any other fragmented packet sent recently* with the same + Source Address and Destination Address. If a Routing header is + present, the Destination Address of concern is that of the final + destination. + + + + + +Deering & Hinden Standards Track [Page 18] + +RFC 2460 IPv6 Specification December 1998 + + + * "recently" means within the maximum likely lifetime of a packet, + including transit time from source to destination and time spent + awaiting reassembly with other fragments of the same packet. + However, it is not required that a source node know the maximum + packet lifetime. Rather, it is assumed that the requirement can + be met by maintaining the Identification value as a simple, 32- + bit, "wrap-around" counter, incremented each time a packet must + be fragmented. It is an implementation choice whether to + maintain a single counter for the node or multiple counters, + e.g., one for each of the node's possible source addresses, or + one for each active (source address, destination address) + combination. 
+ + The initial, large, unfragmented packet is referred to as the + "original packet", and it is considered to consist of two parts, as + illustrated: + + original packet: + + +------------------+----------------------//-----------------------+ + | Unfragmentable | Fragmentable | + | Part | Part | + +------------------+----------------------//-----------------------+ + + The Unfragmentable Part consists of the IPv6 header plus any + extension headers that must be processed by nodes en route to the + destination, that is, all headers up to and including the Routing + header if present, else the Hop-by-Hop Options header if present, + else no extension headers. + + The Fragmentable Part consists of the rest of the packet, that is, + any extension headers that need be processed only by the final + destination node(s), plus the upper-layer header and data. + + The Fragmentable Part of the original packet is divided into + fragments, each, except possibly the last ("rightmost") one, being an + integer multiple of 8 octets long. The fragments are transmitted in + separate "fragment packets" as illustrated: + + original packet: + + +------------------+--------------+--------------+--//--+----------+ + | Unfragmentable | first | second | | last | + | Part | fragment | fragment | .... 
| fragment | + +------------------+--------------+--------------+--//--+----------+ + + + + + + +Deering & Hinden Standards Track [Page 19] + +RFC 2460 IPv6 Specification December 1998 + + + fragment packets: + + +------------------+--------+--------------+ + | Unfragmentable |Fragment| first | + | Part | Header | fragment | + +------------------+--------+--------------+ + + +------------------+--------+--------------+ + | Unfragmentable |Fragment| second | + | Part | Header | fragment | + +------------------+--------+--------------+ + o + o + o + +------------------+--------+----------+ + | Unfragmentable |Fragment| last | + | Part | Header | fragment | + +------------------+--------+----------+ + + Each fragment packet is composed of: + + (1) The Unfragmentable Part of the original packet, with the + Payload Length of the original IPv6 header changed to contain + the length of this fragment packet only (excluding the length + of the IPv6 header itself), and the Next Header field of the + last header of the Unfragmentable Part changed to 44. + + (2) A Fragment header containing: + + The Next Header value that identifies the first header of + the Fragmentable Part of the original packet. + + A Fragment Offset containing the offset of the fragment, + in 8-octet units, relative to the start of the + Fragmentable Part of the original packet. The Fragment + Offset of the first ("leftmost") fragment is 0. + + An M flag value of 0 if the fragment is the last + ("rightmost") one, else an M flag value of 1. + + The Identification value generated for the original + packet. + + (3) The fragment itself. + + The lengths of the fragments must be chosen such that the resulting + fragment packets fit within the MTU of the path to the packets' + destination(s). 
+ + + +Deering & Hinden Standards Track [Page 20] + +RFC 2460 IPv6 Specification December 1998 + + + At the destination, fragment packets are reassembled into their + original, unfragmented form, as illustrated: + + reassembled original packet: + + +------------------+----------------------//------------------------+ + | Unfragmentable | Fragmentable | + | Part | Part | + +------------------+----------------------//------------------------+ + + The following rules govern reassembly: + + An original packet is reassembled only from fragment packets that + have the same Source Address, Destination Address, and Fragment + Identification. + + The Unfragmentable Part of the reassembled packet consists of all + headers up to, but not including, the Fragment header of the first + fragment packet (that is, the packet whose Fragment Offset is + zero), with the following two changes: + + The Next Header field of the last header of the Unfragmentable + Part is obtained from the Next Header field of the first + fragment's Fragment header. + + The Payload Length of the reassembled packet is computed from + the length of the Unfragmentable Part and the length and offset + of the last fragment. For example, a formula for computing the + Payload Length of the reassembled original packet is: + + PL.orig = PL.first - FL.first - 8 + (8 * FO.last) + FL.last + + where + PL.orig = Payload Length field of reassembled packet. + PL.first = Payload Length field of first fragment packet. + FL.first = length of fragment following Fragment header of + first fragment packet. + FO.last = Fragment Offset field of Fragment header of + last fragment packet. + FL.last = length of fragment following Fragment header of + last fragment packet. + + The Fragmentable Part of the reassembled packet is constructed + from the fragments following the Fragment headers in each of the + fragment packets. 
The length of each fragment is computed by + subtracting from the packet's Payload Length the length of the + + + + + +Deering & Hinden Standards Track [Page 21] + +RFC 2460 IPv6 Specification December 1998 + + + headers between the IPv6 header and fragment itself; its relative + position in Fragmentable Part is computed from its Fragment Offset + value. + + The Fragment header is not present in the final, reassembled + packet. + + The following error conditions may arise when reassembling fragmented + packets: + + If insufficient fragments are received to complete reassembly of a + packet within 60 seconds of the reception of the first-arriving + fragment of that packet, reassembly of that packet must be + abandoned and all the fragments that have been received for that + packet must be discarded. If the first fragment (i.e., the one + with a Fragment Offset of zero) has been received, an ICMP Time + Exceeded -- Fragment Reassembly Time Exceeded message should be + sent to the source of that fragment. + + If the length of a fragment, as derived from the fragment packet's + Payload Length field, is not a multiple of 8 octets and the M flag + of that fragment is 1, then that fragment must be discarded and an + ICMP Parameter Problem, Code 0, message should be sent to the + source of the fragment, pointing to the Payload Length field of + the fragment packet. + + If the length and offset of a fragment are such that the Payload + Length of the packet reassembled from that fragment would exceed + 65,535 octets, then that fragment must be discarded and an ICMP + Parameter Problem, Code 0, message should be sent to the source of + the fragment, pointing to the Fragment Offset field of the + fragment packet. + + The following conditions are not expected to occur, but are not + considered errors if they do: + + The number and content of the headers preceding the Fragment + header of different fragments of the same original packet may + differ. 
Whatever headers are present, preceding the Fragment + header in each fragment packet, are processed when the packets + arrive, prior to queueing the fragments for reassembly. Only + those headers in the Offset zero fragment packet are retained in + the reassembled packet. + + The Next Header values in the Fragment headers of different + fragments of the same original packet may differ. Only the value + from the Offset zero fragment packet is used for reassembly. + + + + +Deering & Hinden Standards Track [Page 22] + +RFC 2460 IPv6 Specification December 1998 + + +4.6 Destination Options Header + + The Destination Options header is used to carry optional information + that need be examined only by a packet's destination node(s). The + Destination Options header is identified by a Next Header value of 60 + in the immediately preceding header, and has the following format: + + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Next Header | Hdr Ext Len | | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + | | + . . + . Options . + . . + | | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + Next Header 8-bit selector. Identifies the type of header + immediately following the Destination Options + header. Uses the same values as the IPv4 + Protocol field [RFC-1700 et seq.]. + + Hdr Ext Len 8-bit unsigned integer. Length of the + Destination Options header in 8-octet units, not + including the first 8 octets. + + Options Variable-length field, of length such that the + complete Destination Options header is an + integer multiple of 8 octets long. Contains one + or more TLV-encoded options, as described in + section 4.2. + + The only destination options defined in this document are the Pad1 + and PadN options specified in section 4.2. + + Note that there are two possible ways to encode optional destination + information in an IPv6 packet: either as an option in the Destination + Options header, or as a separate extension header. 
The Fragment + header and the Authentication header are examples of the latter + approach. Which approach can be used depends on what action is + desired of a destination node that does not understand the optional + information: + + o If the desired action is for the destination node to discard + the packet and, only if the packet's Destination Address is not + a multicast address, send an ICMP Unrecognized Type message to + the packet's Source Address, then the information may be + encoded either as a separate header or as an option in the + + + +Deering & Hinden Standards Track [Page 23] + +RFC 2460 IPv6 Specification December 1998 + + + Destination Options header whose Option Type has the value 11 + in its highest-order two bits. The choice may depend on such + factors as which takes fewer octets, or which yields better + alignment or more efficient parsing. + + o If any other action is desired, the information must be encoded + as an option in the Destination Options header whose Option + Type has the value 00, 01, or 10 in its highest-order two bits, + specifying the desired action (see section 4.2). + +4.7 No Next Header + + The value 59 in the Next Header field of an IPv6 header or any + extension header indicates that there is nothing following that + header. If the Payload Length field of the IPv6 header indicates the + presence of octets past the end of a header whose Next Header field + contains 59, those octets must be ignored, and passed on unchanged if + the packet is forwarded. + +5. Packet Size Issues + + IPv6 requires that every link in the internet have an MTU of 1280 + octets or greater. On any link that cannot convey a 1280-octet + packet in one piece, link-specific fragmentation and reassembly must + be provided at a layer below IPv6. 
+ + Links that have a configurable MTU (for example, PPP links [RFC- + 1661]) must be configured to have an MTU of at least 1280 octets; it + is recommended that they be configured with an MTU of 1500 octets or + greater, to accommodate possible encapsulations (i.e., tunneling) + without incurring IPv6-layer fragmentation. + + From each link to which a node is directly attached, the node must be + able to accept packets as large as that link's MTU. + + It is strongly recommended that IPv6 nodes implement Path MTU + Discovery [RFC-1981], in order to discover and take advantage of path + MTUs greater than 1280 octets. However, a minimal IPv6 + implementation (e.g., in a boot ROM) may simply restrict itself to + sending packets no larger than 1280 octets, and omit implementation + of Path MTU Discovery. + + In order to send a packet larger than a path's MTU, a node may use + the IPv6 Fragment header to fragment the packet at the source and + have it reassembled at the destination(s). However, the use of such + fragmentation is discouraged in any application that is able to + adjust its packets to fit the measured path MTU (i.e., down to 1280 + octets). + + + +Deering & Hinden Standards Track [Page 24] + +RFC 2460 IPv6 Specification December 1998 + + + A node must be able to accept a fragmented packet that, after + reassembly, is as large as 1500 octets. A node is permitted to + accept fragmented packets that reassemble to more than 1500 octets. + An upper-layer protocol or application that depends on IPv6 + fragmentation to send packets larger than the MTU of a path should + not send packets larger than 1500 octets unless it has assurance that + the destination is capable of reassembling packets of that larger + size. + + In response to an IPv6 packet that is sent to an IPv4 destination + (i.e., a packet that undergoes translation from IPv6 to IPv4), the + originating IPv6 node may receive an ICMP Packet Too Big message + reporting a Next-Hop MTU less than 1280. 
In that case, the IPv6 node + is not required to reduce the size of subsequent packets to less than + 1280, but must include a Fragment header in those packets so that the + IPv6-to-IPv4 translating router can obtain a suitable Identification + value to use in resulting IPv4 fragments. Note that this means the + payload may have to be reduced to 1232 octets (1280 minus 40 for the + IPv6 header and 8 for the Fragment header), and smaller still if + additional extension headers are used. + +6. Flow Labels + + The 20-bit Flow Label field in the IPv6 header may be used by a + source to label sequences of packets for which it requests special + handling by the IPv6 routers, such as non-default quality of service + or "real-time" service. This aspect of IPv6 is, at the time of + writing, still experimental and subject to change as the requirements + for flow support in the Internet become clearer. Hosts or routers + that do not support the functions of the Flow Label field are + required to set the field to zero when originating a packet, pass the + field on unchanged when forwarding a packet, and ignore the field + when receiving a packet. + + Appendix A describes the current intended semantics and usage of the + Flow Label field. + +7. Traffic Classes + + The 8-bit Traffic Class field in the IPv6 header is available for use + by originating nodes and/or forwarding routers to identify and + distinguish between different classes or priorities of IPv6 packets. + At the point in time at which this specification is being written, + there are a number of experiments underway in the use of the IPv4 + Type of Service and/or Precedence bits to provide various forms of + "differentiated service" for IP packets, other than through the use + of explicit flow set-up. The Traffic Class field in the IPv6 header + is intended to allow similar functionality to be supported in IPv6. 
+ + + +Deering & Hinden Standards Track [Page 25] + +RFC 2460 IPv6 Specification December 1998 + + + It is hoped that those experiments will eventually lead to agreement + on what sorts of traffic classifications are most useful for IP + packets. Detailed definitions of the syntax and semantics of all or + some of the IPv6 Traffic Class bits, whether experimental or intended + for eventual standardization, are to be provided in separate + documents. + + The following general requirements apply to the Traffic Class field: + + o The service interface to the IPv6 service within a node must + provide a means for an upper-layer protocol to supply the value + of the Traffic Class bits in packets originated by that upper- + layer protocol. The default value must be zero for all 8 bits. + + o Nodes that support a specific (experimental or eventual + standard) use of some or all of the Traffic Class bits are + permitted to change the value of those bits in packets that + they originate, forward, or receive, as required for that + specific use. Nodes should ignore and leave unchanged any bits + of the Traffic Class field for which they do not support a + specific use. + + o An upper-layer protocol must not assume that the value of the + Traffic Class bits in a received packet are the same as the + value sent by the packet's source. + + + + + + + + + + + + + + + + + + + + + + + + + + +Deering & Hinden Standards Track [Page 26] + +RFC 2460 IPv6 Specification December 1998 + + +8. Upper-Layer Protocol Issues + +8.1 Upper-Layer Checksums + + Any transport or other upper-layer protocol that includes the + addresses from the IP header in its checksum computation must be + modified for use over IPv6, to include the 128-bit IPv6 addresses + instead of 32-bit IPv4 addresses. 
In particular, the following + illustration shows the TCP and UDP "pseudo-header" for IPv6: + + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + + + + | | + + Source Address + + | | + + + + | | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + + + + | | + + Destination Address + + | | + + + + | | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Upper-Layer Packet Length | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | zero | Next Header | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + o If the IPv6 packet contains a Routing header, the Destination + Address used in the pseudo-header is that of the final + destination. At the originating node, that address will be in + the last element of the Routing header; at the recipient(s), + that address will be in the Destination Address field of the + IPv6 header. + + o The Next Header value in the pseudo-header identifies the + upper-layer protocol (e.g., 6 for TCP, or 17 for UDP). It will + differ from the Next Header value in the IPv6 header if there + are extension headers between the IPv6 header and the upper- + layer header. + + o The Upper-Layer Packet Length in the pseudo-header is the + length of the upper-layer header and data (e.g., TCP header + plus TCP data). Some upper-layer protocols carry their own + + + +Deering & Hinden Standards Track [Page 27] + +RFC 2460 IPv6 Specification December 1998 + + + length information (e.g., the Length field in the UDP header); + for such protocols, that is the length used in the pseudo- + header. Other protocols (such as TCP) do not carry their own + length information, in which case the length used in the + pseudo-header is the Payload Length from the IPv6 header, minus + the length of any extension headers present between the IPv6 + header and the upper-layer header. 
+ + o Unlike IPv4, when UDP packets are originated by an IPv6 node, + the UDP checksum is not optional. That is, whenever + originating a UDP packet, an IPv6 node must compute a UDP + checksum over the packet and the pseudo-header, and, if that + computation yields a result of zero, it must be changed to hex + FFFF for placement in the UDP header. IPv6 receivers must + discard UDP packets containing a zero checksum, and should log + the error. + + The IPv6 version of ICMP [ICMPv6] includes the above pseudo-header in + its checksum computation; this is a change from the IPv4 version of + ICMP, which does not include a pseudo-header in its checksum. The + reason for the change is to protect ICMP from misdelivery or + corruption of those fields of the IPv6 header on which it depends, + which, unlike IPv4, are not covered by an internet-layer checksum. + The Next Header field in the pseudo-header for ICMP contains the + value 58, which identifies the IPv6 version of ICMP. + +8.2 Maximum Packet Lifetime + + Unlike IPv4, IPv6 nodes are not required to enforce maximum packet + lifetime. That is the reason the IPv4 "Time to Live" field was + renamed "Hop Limit" in IPv6. In practice, very few, if any, IPv4 + implementations conform to the requirement that they limit packet + lifetime, so this is not a change in practice. Any upper-layer + protocol that relies on the internet layer (whether IPv4 or IPv6) to + limit packet lifetime ought to be upgraded to provide its own + mechanisms for detecting and discarding obsolete packets. + +8.3 Maximum Upper-Layer Payload Size + + When computing the maximum payload size available for upper-layer + data, an upper-layer protocol must take into account the larger size + of the IPv6 header relative to the IPv4 header. 
For example, in + IPv4, TCP's MSS option is computed as the maximum packet size (a + default value or a value learned through Path MTU Discovery) minus 40 + octets (20 octets for the minimum-length IPv4 header and 20 octets + for the minimum-length TCP header). When using TCP over IPv6, the + MSS must be computed as the maximum packet size minus 60 octets, + + + + +Deering & Hinden Standards Track [Page 28] + +RFC 2460 IPv6 Specification December 1998 + + + because the minimum-length IPv6 header (i.e., an IPv6 header with no + extension headers) is 20 octets longer than a minimum-length IPv4 + header. + +8.4 Responding to Packets Carrying Routing Headers + + When an upper-layer protocol sends one or more packets in response to + a received packet that included a Routing header, the response + packet(s) must not include a Routing header that was automatically + derived by "reversing" the received Routing header UNLESS the + integrity and authenticity of the received Source Address and Routing + header have been verified (e.g., via the use of an Authentication + header in the received packet). In other words, only the following + kinds of packets are permitted in response to a received packet + bearing a Routing header: + + o Response packets that do not carry Routing headers. + + o Response packets that carry Routing headers that were NOT + derived by reversing the Routing header of the received packet + (for example, a Routing header supplied by local + configuration). + + o Response packets that carry Routing headers that were derived + by reversing the Routing header of the received packet IF AND + ONLY IF the integrity and authenticity of the Source Address + and Routing header from the received packet have been verified + by the responder. + + + + + + + + + + + + + + + + + + + + + + + +Deering & Hinden Standards Track [Page 29] + +RFC 2460 IPv6 Specification December 1998 + + +Appendix A. 
Semantics and Usage of the Flow Label Field + + A flow is a sequence of packets sent from a particular source to a + particular (unicast or multicast) destination for which the source + desires special handling by the intervening routers. The nature of + that special handling might be conveyed to the routers by a control + protocol, such as a resource reservation protocol, or by information + within the flow's packets themselves, e.g., in a hop-by-hop option. + The details of such control protocols or options are beyond the scope + of this document. + + There may be multiple active flows from a source to a destination, as + well as traffic that is not associated with any flow. A flow is + uniquely identified by the combination of a source address and a + non-zero flow label. Packets that do not belong to a flow carry a + flow label of zero. + + A flow label is assigned to a flow by the flow's source node. New + flow labels must be chosen (pseudo-)randomly and uniformly from the + range 1 to FFFFF hex. The purpose of the random allocation is to + make any set of bits within the Flow Label field suitable for use as + a hash key by routers, for looking up the state associated with the + flow. + + All packets belonging to the same flow must be sent with the same + source address, destination address, and flow label. If any of those + packets includes a Hop-by-Hop Options header, then they all must be + originated with the same Hop-by-Hop Options header contents + (excluding the Next Header field of the Hop-by-Hop Options header). + If any of those packets includes a Routing header, then they all must + be originated with the same contents in all extension headers up to + and including the Routing header (excluding the Next Header field in + the Routing header). The routers or destinations are permitted, but + not required, to verify that these conditions are satisfied. 
If a + violation is detected, it should be reported to the source by an ICMP + Parameter Problem message, Code 0, pointing to the high-order octet + of the Flow Label field (i.e., offset 1 within the IPv6 packet). + + The maximum lifetime of any flow-handling state established along a + flow's path must be specified as part of the description of the + state-establishment mechanism, e.g., the resource reservation + protocol or the flow-setup hop-by-hop option. A source must not re- + use a flow label for a new flow within the maximum lifetime of any + flow-handling state that might have been established for the prior + use of that flow label. + + + + + + +Deering & Hinden Standards Track [Page 30] + +RFC 2460 IPv6 Specification December 1998 + + + When a node stops and restarts (e.g., as a result of a "crash"), it + must be careful not to use a flow label that it might have used for + an earlier flow whose lifetime may not have expired yet. This may be + accomplished by recording flow label usage on stable storage so that + it can be remembered across crashes, or by refraining from using any + flow labels until the maximum lifetime of any possible previously + established flows has expired. If the minimum time for rebooting the + node is known, that time can be deducted from the necessary waiting + period before starting to allocate flow labels. + + There is no requirement that all, or even most, packets belong to + flows, i.e., carry non-zero flow labels. This observation is placed + here to remind protocol designers and implementors not to assume + otherwise. For example, it would be unwise to design a router whose + performance would be adequate only if most packets belonged to flows, + or to design a header compression scheme that only worked on packets + that belonged to flows. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Deering & Hinden Standards Track [Page 31] + +RFC 2460 IPv6 Specification December 1998 + + +Appendix B. 
Formatting Guidelines for Options + + This appendix gives some advice on how to lay out the fields when + designing new options to be used in the Hop-by-Hop Options header or + the Destination Options header, as described in section 4.2. These + guidelines are based on the following assumptions: + + o One desirable feature is that any multi-octet fields within the + Option Data area of an option be aligned on their natural + boundaries, i.e., fields of width n octets should be placed at + an integer multiple of n octets from the start of the Hop-by- + Hop or Destination Options header, for n = 1, 2, 4, or 8. + + o Another desirable feature is that the Hop-by-Hop or Destination + Options header take up as little space as possible, subject to + the requirement that the header be an integer multiple of 8 + octets long. + + o It may be assumed that, when either of the option-bearing + headers are present, they carry a very small number of options, + usually only one. + + These assumptions suggest the following approach to laying out the + fields of an option: order the fields from smallest to largest, with + no interior padding, then derive the alignment requirement for the + entire option based on the alignment requirement of the largest field + (up to a maximum alignment of 8 octets). 
This approach is + illustrated in the following examples: + + Example 1 + + If an option X required two data fields, one of length 8 octets and + one of length 4 octets, it would be laid out as follows: + + + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Option Type=X |Opt Data Len=12| + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | 4-octet field | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + + 8-octet field + + | | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + + + + + + +Deering & Hinden Standards Track [Page 32] + +RFC 2460 IPv6 Specification December 1998 + + + Its alignment requirement is 8n+2, to ensure that the 8-octet field + starts at a multiple-of-8 offset from the start of the enclosing + header. A complete Hop-by-Hop or Destination Options header + containing this one option would look as follows: + + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Next Header | Hdr Ext Len=1 | Option Type=X |Opt Data Len=12| + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | 4-octet field | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + + 8-octet field + + | | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + Example 2 + + If an option Y required three data fields, one of length 4 octets, + one of length 2 octets, and one of length 1 octet, it would be laid + out as follows: + + +-+-+-+-+-+-+-+-+ + | Option Type=Y | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + |Opt Data Len=7 | 1-octet field | 2-octet field | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | 4-octet field | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + Its alignment requirement is 4n+3, to ensure that the 4-octet field + starts at a multiple-of-4 offset from the start of the enclosing + header. 
A complete Hop-by-Hop or Destination Options header + containing this one option would look as follows: + + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Next Header | Hdr Ext Len=1 | Pad1 Option=0 | Option Type=Y | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + |Opt Data Len=7 | 1-octet field | 2-octet field | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | 4-octet field | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | PadN Option=1 |Opt Data Len=2 | 0 | 0 | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + + + + + + + +Deering & Hinden Standards Track [Page 33] + +RFC 2460 IPv6 Specification December 1998 + + + Example 3 + + A Hop-by-Hop or Destination Options header containing both options X + and Y from Examples 1 and 2 would have one of the two following + formats, depending on which option appeared first: + + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Next Header | Hdr Ext Len=3 | Option Type=X |Opt Data Len=12| + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | 4-octet field | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + + 8-octet field + + | | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | PadN Option=1 |Opt Data Len=1 | 0 | Option Type=Y | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + |Opt Data Len=7 | 1-octet field | 2-octet field | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | 4-octet field | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | PadN Option=1 |Opt Data Len=2 | 0 | 0 | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Next Header | Hdr Ext Len=3 | Pad1 Option=0 | Option Type=Y | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + |Opt Data 
Len=7 | 1-octet field | 2-octet field | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | 4-octet field | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | PadN Option=1 |Opt Data Len=4 | 0 | 0 | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | 0 | 0 | Option Type=X |Opt Data Len=12| + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | 4-octet field | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + + 8-octet field + + | | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + + + + + + + + +Deering & Hinden Standards Track [Page 34] + +RFC 2460 IPv6 Specification December 1998 + + +Security Considerations + + The security features of IPv6 are described in the Security + Architecture for the Internet Protocol [RFC-2401]. + +Acknowledgments + + The authors gratefully acknowledge the many helpful suggestions of + the members of the IPng working group, the End-to-End Protocols + research group, and the Internet Community At Large. + +Authors' Addresses + + Stephen E. Deering + Cisco Systems, Inc. + 170 West Tasman Drive + San Jose, CA 95134-1706 + USA + + Phone: +1 408 527 8213 + Fax: +1 408 527 8254 + EMail: deering@cisco.com + + + Robert M. Hinden + Nokia + 232 Java Drive + Sunnyvale, CA 94089 + USA + + Phone: +1 408 990-2004 + Fax: +1 408 743-5677 + EMail: hinden@iprg.nokia.com + +References + + [RFC-2401] Kent, S. and R. Atkinson, "Security Architecture for the + Internet Protocol", RFC 2401, November 1998. + + [RFC-2402] Kent, S. and R. Atkinson, "IP Authentication Header", + RFC 2402, November 1998. + + [RFC-2406] Kent, S. and R. Atkinson, "IP Encapsulating Security + Protocol (ESP)", RFC 2406, November 1998. + + [ICMPv6] Conta, A. and S. Deering, "ICMP for the Internet + Protocol Version 6 (IPv6)", RFC 2463, December 1998. 
+ + + + +Deering & Hinden Standards Track [Page 35] + +RFC 2460 IPv6 Specification December 1998 + + + [ADDRARCH] Hinden, R. and S. Deering, "IP Version 6 Addressing + Architecture", RFC 2373, July 1998. + + [RFC-1981] McCann, J., Mogul, J. and S. Deering, "Path MTU + Discovery for IP version 6", RFC 1981, August 1996. + + [RFC-791] Postel, J., "Internet Protocol", STD 5, RFC 791, + September 1981. + + [RFC-1700] Reynolds, J. and J. Postel, "Assigned Numbers", STD 2, + RFC 1700, October 1994. See also: + http://www.iana.org/numbers.html + + [RFC-1661] Simpson, W., "The Point-to-Point Protocol (PPP)", STD + 51, RFC 1661, July 1994. + +CHANGES SINCE RFC-1883 + + This memo has the following changes from RFC-1883. Numbers identify + the Internet-Draft version in which the change was made. + + 02) Removed all references to jumbograms and the Jumbo Payload + option (moved to a separate document). + + 02) Moved most of Flow Label description from section 6 to (new) + Appendix A. + + 02) In Flow Label description, now in Appendix A, corrected maximum + Flow Label value from FFFFFF to FFFFF (i.e., one less "F") due + to reduction of size of Flow Label field from 24 bits to 20 + bits. + + 02) Renumbered (relettered?) the previous Appendix A to be Appendix + B. + + 02) Changed the wording of the Security Considerations section to + avoid dependency loop between this spec and the IPsec specs. + + 02) Updated R. Hinden's email address and company affiliation. + + + -------------------------------------------------------- + + 01) In section 3, changed field name "Class" to "Traffic Class" and + increased its size from 4 to 8 bits. Decreased size of Flow + Label field from 24 to 20 bits to compensate for increase in + Traffic Class field. 
+ + + + +Deering & Hinden Standards Track [Page 36] + +RFC 2460 IPv6 Specification December 1998 + + + 01) In section 4.1, restored the order of the Authentication Header + and the ESP header, which were mistakenly swapped in the 00 + version of this memo. + + 01) In section 4.4, deleted the Strict/Loose Bit Map field and the + strict routing functionality from the Type 0 Routing header, and + removed the restriction on number of addresses that may be + carried in the Type 0 Routing header (was limited to 23 + addresses, because of the size of the strict/loose bit map). + + 01) In section 5, changed the minimum IPv6 MTU from 576 to 1280 + octets, and added a recommendation that links with configurable + MTU (e.g., PPP links) be configured to have an MTU of at least + 1500 octets. + + 01) In section 5, deleted the requirement that a node must not send + fragmented packets that reassemble to more than 1500 octets + without knowledge of the destination reassembly buffer size, and + replaced it with a recommendation that upper-layer protocols or + applications should not do that. + + 01) Replaced reference to the IPv4 Path MTU Discovery spec (RFC- + 1191) with reference to the IPv6 Path MTU Discovery spec (RFC- + 1981), and deleted the Notes at the end of section 5 regarding + Path MTU Discovery, since those details are now covered by RFC- + 1981. + + 01) In section 6, deleted specification of "opportunistic" flow + set-up, and removed all references to the 6-second maximum + lifetime for opportunistically established flow state. + + 01) In section 7, deleted the provisional description of the + internal structure and semantics of the Traffic Class field, and + specified that such descriptions be provided in separate + documents. + + -------------------------------------------------------- + + 00) In section 4, corrected the Code value to indicate "unrecognized + Next Header type encountered" in an ICMP Parameter Problem + message (changed from 2 to 1). 
+ + 00) In the description of the Payload Length field in section 3, and + of the Jumbo Payload Length field in section 4.3, made it + clearer that extension headers are included in the payload + length count. + + + + + +Deering & Hinden Standards Track [Page 37] + +RFC 2460 IPv6 Specification December 1998 + + + 00) In section 4.1, swapped the order of the Authentication header + and the ESP header. (NOTE: this was a mistake, and the change + was undone in version 01.) + + 00) In section 4.2, made it clearer that options are identified by + the full 8-bit Option Type, not by the low-order 5 bits of an + Option Type. Also specified that the same Option Type numbering + space is used for both Hop-by-Hop Options and Destination + Options headers. + + 00) In section 4.4, added a sentence requiring that nodes processing + a Routing header must send an ICMP Packet Too Big message in + response to a packet that is too big to fit in the next hop link + (rather than, say, performing fragmentation). + + 00) Changed the name of the IPv6 Priority field to "Class", and + replaced the previous description of Priority in section 7 with + a description of the Class field. Also, excluded this field + from the set of fields that must remain the same for all packets + in the same flow, as specified in section 6. + + 00) In the pseudo-header in section 8.1, changed the name of the + "Payload Length" field to "Upper-Layer Packet Length". Also + clarified that, in the case of protocols that carry their own + length info (like non-jumbogram UDP), it is the upper-layer- + derived length, not the IP-layer-derived length, that is used in + the pseudo-header. + + 00) Added section 8.4, specifying that upper-layer protocols, when + responding to a received packet that carried a Routing header, + must not include the reverse of the Routing header in the + response packet(s) unless the received Routing header was + authenticated. + + 00) Fixed some typos and grammatical errors. 
+ + 00) Authors' contact info updated. + + -------------------------------------------------------- + + + + + + + + + + + + +Deering & Hinden Standards Track [Page 38] + +RFC 2460 IPv6 Specification December 1998 + + +Full Copyright Statement + + Copyright (C) The Internet Society (1998). All Rights Reserved. + + This document and translations of it may be copied and furnished to + others, and derivative works that comment on or otherwise explain it + or assist in its implementation may be prepared, copied, published + and distributed, in whole or in part, without restriction of any + kind, provided that the above copyright notice and this paragraph are + included on all such copies and derivative works. However, this + document itself may not be modified in any way, such as by removing + the copyright notice or references to the Internet Society or other + Internet organizations, except as needed for the purpose of + developing Internet standards in which case the procedures for + copyrights defined in the Internet Standards process must be + followed, or as required to translate it into languages other than + English. + + The limited permissions granted above are perpetual and will not be + revoked by the Internet Society or its successors or assigns. + + This document and the information contained herein is provided on an + "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING + TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING + BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION + HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF + MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + + + + + + + + + + + + + + + + + + + + + + + + +Deering & Hinden Standards Track [Page 39] + diff --git a/ext/picotcp/RFC/rfc2525.txt b/ext/picotcp/RFC/rfc2525.txt new file mode 100644 index 0000000..d5f30e8 --- /dev/null +++ b/ext/picotcp/RFC/rfc2525.txt @@ -0,0 +1,3419 @@ + + + + + + +Network Working Group V. 
Paxson +Request for Comments: 2525 Editor +Category: Informational ACIRI / ICSI + M. Allman + NASA Glenn Research Center/Sterling Software + S. Dawson + Real-Time Computing Laboratory + W. Fenner + Xerox PARC + J. Griner + NASA Glenn Research Center + I. Heavens + Spider Software Ltd. + K. Lahey + NASA Ames Research Center/MRJ + J. Semke + Pittsburgh Supercomputing Center + B. Volz + Process Software Corporation + March 1999 + + + Known TCP Implementation Problems + +Status of this Memo + + This memo provides information for the Internet community. It does + not specify an Internet standard of any kind. Distribution of this + memo is unlimited. + +Copyright Notice + + Copyright (C) The Internet Society (1999). All Rights Reserved. + +Table of Contents + + 1. INTRODUCTION....................................................2 + 2. KNOWN IMPLEMENTATION PROBLEMS...................................3 + 2.1 No initial slow start........................................3 + 2.2 No slow start after retransmission timeout...................6 + 2.3 Uninitialized CWND...........................................9 + 2.4 Inconsistent retransmission.................................11 + 2.5 Failure to retain above-sequence data.......................13 + 2.6 Extra additive constant in congestion avoidance.............17 + 2.7 Initial RTO too low.........................................23 + 2.8 Failure of window deflation after loss recovery.............26 + 2.9 Excessively short keepalive connection timeout..............28 + 2.10 Failure to back off retransmission timeout..................31 + + + +Paxson, et. al. 
Informational [Page 1] + +RFC 2525 TCP Implementation Problems March 1999 + + + 2.11 Insufficient interval between keepalives....................34 + 2.12 Window probe deadlock.......................................36 + 2.13 Stretch ACK violation.......................................40 + 2.14 Retransmission sends multiple packets.......................43 + 2.15 Failure to send FIN notification promptly...................45 + 2.16 Failure to send a RST after Half Duplex Close...............47 + 2.17 Failure to RST on close with data pending...................50 + 2.18 Options missing from TCP MSS calculation....................54 + 3. SECURITY CONSIDERATIONS........................................56 + 4. ACKNOWLEDGEMENTS...............................................56 + 5. REFERENCES.....................................................57 + 6. AUTHORS' ADDRESSES.............................................58 + 7. FULL COPYRIGHT STATEMENT.......................................60 + +1. Introduction + + This memo catalogs a number of known TCP implementation problems. + The goal in doing so is to improve conditions in the existing + Internet by enhancing the quality of current TCP/IP implementations. + It is hoped that both performance and correctness issues can be + resolved by making implementors aware of the problems and their + solutions. In the long term, it is hoped that this will provide a + reduction in unnecessary traffic on the network, the rate of + connection failures due to protocol errors, and load on network + servers due to time spent processing both unsuccessful connections + and retransmitted data. This will help to ensure the stability of + the global Internet. + + Each problem is defined as follows: + + Name of Problem + The name associated with the problem. In this memo, the name is + given as a subsection heading. 
+ + Classification + One or more problem categories for which the problem is + classified: "congestion control", "performance", "reliability", + "resource management". + + Description + A definition of the problem, succinct but including necessary + background material. + + Significance + A brief summary of the sorts of environments for which the problem + is significant. + + + + + +Paxson, et. al. Informational [Page 2] + +RFC 2525 TCP Implementation Problems March 1999 + + + Implications + Why the problem is viewed as a problem. + + Relevant RFCs + The RFCs defining the TCP specification with which the problem + conflicts. These RFCs often qualify behavior using terms such as + MUST, SHOULD, MAY, and others written capitalized. See RFC 2119 + for the exact interpretation of these terms. + + Trace file demonstrating the problem + One or more ASCII trace files demonstrating the problem, if + applicable. + + Trace file demonstrating correct behavior + One or more examples of how correct behavior appears in a trace, + if applicable. + + References + References that further discuss the problem. + + How to detect + How to test an implementation to see if it exhibits the problem. + This discussion may include difficulties and subtleties associated + with causing the problem to manifest itself, and with interpreting + traces to detect the presence of the problem (if applicable). + + How to fix + For known causes of the problem, how to correct the + implementation. + +2. Known implementation problems + +2.1. + + Name of Problem + No initial slow start + + Classification + Congestion control + + Description + When a TCP begins transmitting data, it is required by RFC 1122, + 4.2.2.15, to engage in a "slow start" by initializing its + congestion window, cwnd, to one packet (one segment of the maximum + size). (Note that an experimental change to TCP, documented in + [RFC2414], allows an initial value somewhat larger than one + packet.) 
It subsequently increases cwnd by one packet for each + ACK it receives for new data. The minimum of cwnd and the + + + +Paxson, et. al. Informational [Page 3] + +RFC 2525 TCP Implementation Problems March 1999 + + + receiver's advertised window bounds the highest sequence number + the TCP can transmit. A TCP that fails to initialize and + increment cwnd in this fashion exhibits "No initial slow start". + + Significance + In congested environments, detrimental to the performance of other + connections, and possibly to the connection itself. + + Implications + A TCP failing to slow start when beginning a connection results in + traffic bursts that can stress the network, leading to excessive + queueing delays and packet loss. + + Implementations exhibiting this problem might do so because they + suffer from the general problem of not including the required + congestion window. These implementations will also suffer from + "No slow start after retransmission timeout". + + There are different shades of "No initial slow start". From the + perspective of stressing the network, the worst is a connection + that simply always sends based on the receiver's advertised + window, with no notion of a separate congestion window. Another + form is described in "Uninitialized CWND" below. + + Relevant RFCs + RFC 1122 requires use of slow start. RFC 2001 gives the specifics + of slow start. + + Trace file demonstrating it + Made using tcpdump [Jacobson89] recording at the connection + responder. No losses reported by the packet filter. + + 10:40:42.244503 B > A: S 1168512000:1168512000(0) win 32768 + (DF) [tos 0x8] + 10:40:42.259908 A > B: S 3688169472:3688169472(0) + ack 1168512001 win 32768 + 10:40:42.389992 B > A: . ack 1 win 33580 (DF) [tos 0x8] + 10:40:42.664975 A > B: P 1:513(512) ack 1 win 32768 + 10:40:42.700185 A > B: . 513:1973(1460) ack 1 win 32768 + 10:40:42.718017 A > B: . 1973:3433(1460) ack 1 win 32768 + 10:40:42.762945 A > B: . 
3433:4893(1460) ack 1 win 32768 + 10:40:42.811273 A > B: . 4893:6353(1460) ack 1 win 32768 + 10:40:42.829149 A > B: . 6353:7813(1460) ack 1 win 32768 + 10:40:42.853687 B > A: . ack 1973 win 33580 (DF) [tos 0x8] + 10:40:42.864031 B > A: . ack 3433 win 33580 (DF) [tos 0x8] + + + + + + +Paxson, et. al. Informational [Page 4] + +RFC 2525 TCP Implementation Problems March 1999 + + + After the third packet, the connection is established. A, the + connection responder, begins transmitting to B, the connection + initiator. Host A quickly sends 6 packets comprising 7812 bytes, + even though the SYN exchange agreed upon an MSS of 1460 bytes + (implying an initial congestion window of 1 segment corresponds to + 1460 bytes), and so A should have sent at most 1460 bytes. + + The ACKs sent by B to A in the last two lines indicate that this + trace is not a measurement error (slow start really occurring but + the corresponding ACKs having been dropped by the packet filter). + + A second trace confirmed that the problem is repeatable. + + Trace file demonstrating correct behavior + Made using tcpdump recording at the connection originator. No + losses reported by the packet filter. + + 12:35:31.914050 C > D: S 1448571845:1448571845(0) + win 4380 + 12:35:32.068819 D > C: S 1755712000:1755712000(0) + ack 1448571846 win 4096 + 12:35:32.069341 C > D: . ack 1 win 4608 + 12:35:32.075213 C > D: P 1:513(512) ack 1 win 4608 + 12:35:32.286073 D > C: . ack 513 win 4096 + 12:35:32.287032 C > D: . 513:1025(512) ack 1 win 4608 + 12:35:32.287506 C > D: . 1025:1537(512) ack 1 win 4608 + 12:35:32.432712 D > C: . ack 1537 win 4096 + 12:35:32.433690 C > D: . 1537:2049(512) ack 1 win 4608 + 12:35:32.434481 C > D: . 2049:2561(512) ack 1 win 4608 + 12:35:32.435032 C > D: . 2561:3073(512) ack 1 win 4608 + 12:35:32.594526 D > C: . ack 3073 win 4096 + 12:35:32.595465 C > D: . 3073:3585(512) ack 1 win 4608 + 12:35:32.595947 C > D: . 3585:4097(512) ack 1 win 4608 + 12:35:32.596414 C > D: . 
4097:4609(512) ack 1 win 4608 + 12:35:32.596888 C > D: . 4609:5121(512) ack 1 win 4608 + 12:35:32.733453 D > C: . ack 4097 win 4096 + + References + This problem is documented in [Paxson97]. + + How to detect + For implementations always manifesting this problem, it shows up + immediately in a packet trace or a sequence plot, as illustrated + above. + + + + + + + +Paxson, et. al. Informational [Page 5] + +RFC 2525 TCP Implementation Problems March 1999 + + + How to fix + If the root problem is that the implementation lacks a notion of a + congestion window, then unfortunately this requires significant + work to fix. However, doing so is important, as such + implementations also exhibit "No slow start after retransmission + timeout". + +2.2. + + Name of Problem + No slow start after retransmission timeout + + Classification + Congestion control + + Description + When a TCP experiences a retransmission timeout, it is required by + RFC 1122, 4.2.2.15, to engage in "slow start" by initializing its + congestion window, cwnd, to one packet (one segment of the maximum + size). It subsequently increases cwnd by one packet for each ACK + it receives for new data until it reaches the "congestion + avoidance" threshold, ssthresh, at which point the congestion + avoidance algorithm for updating the window takes over. A TCP + that fails to enter slow start upon a timeout exhibits "No slow + start after retransmission timeout". + + Significance + In congested environments, severely detrimental to the performance + of other connections, and also the connection itself. + + Implications + Entering slow start upon timeout forms one of the cornerstones of + Internet congestion stability, as outlined in [Jacobson88]. If + TCPs fail to do so, the network becomes at risk of suffering + "congestion collapse" [RFC896]. + + Relevant RFCs + RFC 1122 requires use of slow start after loss. RFC 2001 gives + the specifics of how to implement slow start. RFC 896 describes + congestion collapse. 
+ + The retransmission timeout discussed here should not be confused + with the separate "fast recovery" retransmission mechanism + discussed in RFC 2001. + + Trace file demonstrating it + Made using tcpdump recording at the sending TCP (A). No losses + reported by the packet filter. + + + +Paxson, et. al. Informational [Page 6] + +RFC 2525 TCP Implementation Problems March 1999 + + + 10:40:59.090612 B > A: . ack 357125 win 33580 (DF) [tos 0x8] + 10:40:59.222025 A > B: . 357125:358585(1460) ack 1 win 32768 + 10:40:59.868871 A > B: . 357125:358585(1460) ack 1 win 32768 + 10:41:00.016641 B > A: . ack 364425 win 33580 (DF) [tos 0x8] + 10:41:00.036709 A > B: . 364425:365885(1460) ack 1 win 32768 + 10:41:00.045231 A > B: . 365885:367345(1460) ack 1 win 32768 + 10:41:00.053785 A > B: . 367345:368805(1460) ack 1 win 32768 + 10:41:00.062426 A > B: . 368805:370265(1460) ack 1 win 32768 + 10:41:00.071074 A > B: . 370265:371725(1460) ack 1 win 32768 + 10:41:00.079794 A > B: . 371725:373185(1460) ack 1 win 32768 + 10:41:00.089304 A > B: . 373185:374645(1460) ack 1 win 32768 + 10:41:00.097738 A > B: . 374645:376105(1460) ack 1 win 32768 + 10:41:00.106409 A > B: . 376105:377565(1460) ack 1 win 32768 + 10:41:00.115024 A > B: . 377565:379025(1460) ack 1 win 32768 + 10:41:00.123576 A > B: . 379025:380485(1460) ack 1 win 32768 + 10:41:00.132016 A > B: . 380485:381945(1460) ack 1 win 32768 + 10:41:00.141635 A > B: . 381945:383405(1460) ack 1 win 32768 + 10:41:00.150094 A > B: . 383405:384865(1460) ack 1 win 32768 + 10:41:00.158552 A > B: . 384865:386325(1460) ack 1 win 32768 + 10:41:00.167053 A > B: . 386325:387785(1460) ack 1 win 32768 + 10:41:00.175518 A > B: . 387785:389245(1460) ack 1 win 32768 + 10:41:00.210835 A > B: . 389245:390705(1460) ack 1 win 32768 + 10:41:00.226108 A > B: . 390705:392165(1460) ack 1 win 32768 + 10:41:00.241524 B > A: . ack 389245 win 8760 (DF) [tos 0x8] + + The first packet indicates the ack point is 357125. 
130 msec + after receiving the ACK, A transmits the packet after the ACK + point, 357125:358585. 640 msec after this transmission, it + retransmits 357125:358585, in an apparent retransmission timeout. + At this point, A's cwnd should be one MSS, or 1460 bytes, as A + enters slow start. The trace is consistent with this possibility. + + B replies with an ACK of 364425, indicating that A has filled a + sequence hole. At this point, A's cwnd should be 1460*2 = 2920 + bytes, since in slow start receiving an ACK advances cwnd by MSS. + However, A then launches 19 consecutive packets, which is + inconsistent with slow start. + + A second trace confirmed that the problem is repeatable. + + Trace file demonstrating correct behavior + Made using tcpdump recording at the sending TCP (C). No losses + reported by the packet filter. + + 12:35:48.442538 C > D: P 465409:465921(512) ack 1 win 4608 + 12:35:48.544483 D > C: . ack 461825 win 4096 + 12:35:48.703496 D > C: . ack 461825 win 4096 + 12:35:49.044613 C > D: . 461825:462337(512) ack 1 win 4608 + + + +Paxson, et. al. Informational [Page 7] + +RFC 2525 TCP Implementation Problems March 1999 + + + 12:35:49.192282 D > C: . ack 465921 win 2048 + 12:35:49.192538 D > C: . ack 465921 win 4096 + 12:35:49.193392 C > D: P 465921:466433(512) ack 1 win 4608 + 12:35:49.194726 C > D: P 466433:466945(512) ack 1 win 4608 + 12:35:49.350665 D > C: . ack 466945 win 4096 + 12:35:49.351694 C > D: . 466945:467457(512) ack 1 win 4608 + 12:35:49.352168 C > D: . 467457:467969(512) ack 1 win 4608 + 12:35:49.352643 C > D: . 467969:468481(512) ack 1 win 4608 + 12:35:49.506000 D > C: . ack 467969 win 3584 + + After C transmits the first packet shown to D, it takes no action + in response to D's ACKs for 461825, because the first packet + already reached the advertised window limit of 4096 bytes above + 461825. 600 msec after transmitting the first packet, C + retransmits 461825:462337, presumably due to a timeout. 
Its + congestion window is now MSS (512 bytes). + + D acks 465921, indicating that C's retransmission filled a + sequence hole. This ACK advances C's cwnd from 512 to 1024. Very + shortly after, D acks 465921 again in order to update the offered + window from 2048 to 4096. This ACK does not advance cwnd since it + is not for new data. Very shortly after, C responds to the newly + enlarged window by transmitting two packets. D acks both, + advancing cwnd from 1024 to 1536. C in turn transmits three + packets. + + References + This problem is documented in [Paxson97]. + + How to detect + Packet loss is common enough in the Internet that generally it is + not difficult to find an Internet path that will force + retransmission due to packet loss. + + If the effective window prior to loss is large enough, however, + then the TCP may retransmit using the "fast recovery" mechanism + described in RFC 2001. In a packet trace, the signature of fast + recovery is that the packet retransmission occurs in response to + the receipt of three duplicate ACKs, and subsequent duplicate ACKs + may lead to the transmission of new data, above both the ack point + and the highest sequence transmitted so far. An absence of three + duplicate ACKs prior to retransmission suffices to distinguish + between timeout and fast recovery retransmissions. In the face of + only observing fast recovery retransmissions, generally it is not + difficult to repeat the data transfer until observing a timeout + retransmission. + + + + + +Paxson, et. al. Informational [Page 8] + +RFC 2525 TCP Implementation Problems March 1999 + + + Once armed with a trace exhibiting a timeout retransmission, + determining whether the TCP follows slow start is done by + computing the correct progression of cwnd and comparing it to the + amount of data transmitted by the TCP subsequent to the timeout + retransmission. 
+ + How to fix + If the root problem is that the implementation lacks a notion of a + congestion window, then unfortunately this requires significant + work to fix. However, doing so is critical, for reasons outlined + above. + +2.3. + + Name of Problem + Uninitialized CWND + + Classification + Congestion control + + Description + As described above for "No initial slow start", when a TCP + connection begins cwnd is initialized to one segment (or perhaps a + few segments, if experimenting with [RFC2414]). One particular + form of "No initial slow start", worth separate mention as the bug + is fairly widely deployed, is "Uninitialized CWND". That is, + while the TCP implements the proper slow start mechanism, it fails + to initialize cwnd properly, so slow start in fact fails to occur. + + One way the bug can occur is if, during the connection + establishment handshake, the SYN ACK packet arrives without an MSS + option. The faulty implementation uses receipt of the MSS option + to initialize cwnd to one segment; if the option fails to arrive, + then cwnd is instead initialized to a very large value. + + Significance + In congested environments, detrimental to the performance of other + connections, and likely to the connection itself. The burst can + be so large (see below) that it has deleterious effects even in + uncongested environments. + + Implications + A TCP exhibiting this behavior is stressing the network with a + large burst of packets, which can cause loss in the network. + + Relevant RFCs + RFC 1122 requires use of slow start. RFC 2001 gives the specifics + of slow start. + + + +Paxson, et. al. Informational [Page 9] + +RFC 2525 TCP Implementation Problems March 1999 + + + Trace file demonstrating it + This trace was made using tcpdump running on host A. Host A is + the sender and host B is the receiver. The advertised window and + timestamp options have been omitted for clarity, except for the + first segment sent by host A. 
Note that A sends an MSS option in + its initial SYN but B does not include one in its reply. + + 16:56:02.226937 A > B: S 237585307:237585307(0) win 8192 + + 16:56:02.557135 B > A: S 1617216000:1617216000(0) + ack 237585308 win 16384 + 16:56:02.557788 A > B: . ack 1 win 8192 + 16:56:02.566014 A > B: . 1:537(536) ack 1 + 16:56:02.566557 A > B: . 537:1073(536) ack 1 + 16:56:02.567120 A > B: . 1073:1609(536) ack 1 + 16:56:02.567662 A > B: P 1609:2049(440) ack 1 + 16:56:02.568349 A > B: . 2049:2585(536) ack 1 + 16:56:02.568909 A > B: . 2585:3121(536) ack 1 + + [54 additional burst segments deleted for brevity] + + 16:56:02.936638 A > B: . 32065:32601(536) ack 1 + 16:56:03.018685 B > A: . ack 1 + + After the three-way handshake, host A bursts 61 segments into the + network, before duplicate ACKs on the first segment cause a + retransmission to occur. Since host A did not wait for the ACK on + the first segment before sending additional segments, it is + exhibiting "Uninitialized CWND" + + Trace file demonstrating correct behavior + + See the example for "No initial slow start". + + References + This problem is documented in [Paxson97]. + + How to detect + This problem can be detected by examining a packet trace recorded + at either the sender or the receiver. However, the bug can be + difficult to induce because it requires finding a remote TCP peer + that does not send an MSS option in its SYN ACK. + + How to fix + This problem can be fixed by ensuring that cwnd is initialized + upon receipt of a SYN ACK, even if the SYN ACK does not contain an + MSS option. + + + + +Paxson, et. al. Informational [Page 10] + +RFC 2525 TCP Implementation Problems March 1999 + + +2.4. 
+ + Name of Problem + Inconsistent retransmission + + Classification + Reliability + + Description + If, for a given sequence number, a sending TCP retransmits + different data than previously sent for that sequence number, then + a strong possibility arises that the receiving TCP will + reconstruct a different byte stream than that sent by the sending + application, depending on which instance of the sequence number it + accepts. + + Such a sending TCP exhibits "Inconsistent retransmission". + + Significance + Critical for all environments. + + Implications + Reliable delivery of data is a fundamental property of TCP. + + Relevant RFCs + RFC 793, section 1.5, discusses the central role of reliability in + TCP operation. + + Trace file demonstrating it + Made using tcpdump recording at the receiving TCP (B). No losses + reported by the packet filter. + + 12:35:53.145503 A > B: FP 90048435:90048461(26) + ack 393464682 win 4096 + 4500 0042 9644 0000 + 3006 e4c2 86b1 0401 83f3 010a b2a4 0015 + 055e 07b3 1773 cb6a 5019 1000 68a9 0000 + data starts here>504f 5254 2031 3334 2c31 3737*2c34 2c31 + 2c31 3738 2c31 3635 0d0a + 12:35:53.146479 B > A: R 393464682:393464682(0) win 8192 + 12:35:53.851714 A > B: FP 90048429:90048463(34) + ack 393464682 win 4096 + 4500 004a 965b 0000 + 3006 e4a3 86b1 0401 83f3 010a b2a4 0015 + 055e 07ad 1773 cb6a 5019 1000 8bd3 0000 + data starts here>5041 5356 0d0a 504f 5254 2031 3334 2c31 + 3737*2c31 3035 2c31 3431 2c34 2c31 3539 + 0d0a + + + +Paxson, et. al. Informational [Page 11] + +RFC 2525 TCP Implementation Problems March 1999 + + + The sequence numbers shown in this trace are absolute and not + adjusted to reflect the ISN. The 4-digit hex values show a dump + of the packet's IP and TCP headers, as well as payload. A first + sends to B data for 90048435:90048461. The corresponding data + begins with hex words 504f, 5254, etc. + + B responds with a RST. Since the recording location was local to + B, it is unknown whether A received the RST. 
+ + A then sends 90048429:90048463, which includes six sequence + positions below the earlier transmission, all 26 positions of the + earlier transmission, and two additional sequence positions. + + The retransmission disagrees starting just after sequence + 90048447, annotated above with a leading '*'. These two bytes + were originally transmitted as hex 2c34 but retransmitted as hex + 2c31. Subsequent positions disagree as well. + + This behavior has been observed in other traces involving + different hosts. It is unknown how to repeat it. + + In this instance, no corruption would occur, since B has already + indicated it will not accept further packets from A. + + A second example illustrates a slightly different instance of the + problem. The tracing again was made with tcpdump at the receiving + TCP (D). + + 22:23:58.645829 C > D: P 185:212(27) ack 565 win 4096 + 4500 0043 90a3 0000 + 3306 0734 cbf1 9eef 83f3 010a 0525 0015 + a3a2 faba 578c 70a4 5018 1000 9a53 0000 + data starts here>504f 5254 2032 3033 2c32 3431 2c31 3538 + 2c32 3339 2c35 2c34 330d 0a + 22:23:58.646805 D > C: . ack 184 win 8192 + 4500 0028 beeb 0000 + 3e06 ce06 83f3 010a cbf1 9eef 0015 0525 + 578c 70a4 a3a2 fab9 5010 2000 342f 0000 + 22:31:36.532244 C > D: FP 186:213(27) ack 565 win 4096 + 4500 0043 9435 0000 + 3306 03a2 cbf1 9eef 83f3 010a 0525 0015 + a3a2 fabb 578c 70a4 5019 1000 9a51 0000 + data starts here>504f 5254 2032 3033 2c32 3431 2c31 3538 + 2c32 3339 2c35 2c34 330d 0a + + + + + + + +Paxson, et. al. Informational [Page 12] + +RFC 2525 TCP Implementation Problems March 1999 + + + In this trace, sequence numbers are relative. C sends 185:212, + but D only sends an ACK for 184 (so sequence number 184 is + missing). C then sends 186:213. The packet payload is identical + to the previous payload, but the base sequence number is one + higher, resulting in an inconsistent retransmission. + + Neither trace exhibits checksum errors. 
+ + Trace file demonstrating correct behavior + (Omitted, as presumably correct behavior is obvious.) + + References + None known. + + How to detect + This problem unfortunately can be very difficult to detect, since + available experience indicates it is quite rare that it is + manifested. No "trigger" has been identified that can be used to + reproduce the problem. + + How to fix + In the absence of a known "trigger", we cannot always assess how + to fix the problem. + + In one implementation (not the one illustrated above), the problem + manifested itself when (1) the sender received a zero window and + stalled; (2) eventually an ACK arrived that offered a window + larger than that in effect at the time of the stall; (3) the + sender transmitted out of the buffer of data it held at the time + of the stall, but (4) failed to limit this transfer to the buffer + length, instead using the newly advertised (and larger) offered + window. Consequently, in addition to the valid buffer contents, + it sent whatever garbage values followed the end of the buffer. + If it then retransmitted the corresponding sequence numbers, at + that point it sent the correct data, resulting in an inconsistent + retransmission. Note that this instance of the problem reflects a + more general problem, that of initially transmitting incorrect + data. + +2.5. + + Name of Problem + Failure to retain above-sequence data + + Classification + Congestion control, performance + + + + + +Paxson, et. al. Informational [Page 13] + +RFC 2525 TCP Implementation Problems March 1999 + + + Description + When a TCP receives an "above sequence" segment, meaning one with + a sequence number exceeding RCV.NXT but below RCV.NXT+RCV.WND, it + SHOULD queue the segment for later delivery (RFC 1122, 4.2.2.20). + (See RFC 793 for the definition of RCV.NXT and RCV.WND.) A TCP + that fails to do so is said to exhibit "Failure to retain above- + sequence data". 
+ + It may sometimes be appropriate for a TCP to discard above- + sequence data to reclaim memory. If it does so only rarely, then + we would not consider it to exhibit this problem. Instead, the + particular concern is with TCPs that always discard above-sequence + data. + + Significance + In environments prone to packet loss, detrimental to the + performance of both other connections and the connection itself. + + Implications + In times of congestion, a failure to retain above-sequence data + will lead to numerous otherwise-unnecessary retransmissions, + aggravating the congestion and potentially reducing performance by + a large factor. + + Relevant RFCs + RFC 1122 revises RFC 793 by upgrading the latter's MAY to a SHOULD + on this issue. + + Trace file demonstrating it + Made using tcpdump recording at the receiving TCP. No losses + reported by the packet filter. + + B is the TCP sender, A the receiver. A exhibits failure to retain + above-sequence data: + + 10:38:10.164860 B > A: . 221078:221614(536) ack 1 win 33232 [tos 0x8] + 10:38:10.170809 B > A: . 221614:222150(536) ack 1 win 33232 [tos 0x8] + 10:38:10.177183 B > A: . 222150:222686(536) ack 1 win 33232 [tos 0x8] + 10:38:10.225039 A > B: . ack 222686 win 25800 + + Here B has sent up to (relative) sequence 222686 in-sequence, and + A accordingly acknowledges. + + 10:38:10.268131 B > A: . 223222:223758(536) ack 1 win 33232 [tos 0x8] + 10:38:10.337995 B > A: . 223758:224294(536) ack 1 win 33232 [tos 0x8] + 10:38:10.344065 B > A: . 224294:224830(536) ack 1 win 33232 [tos 0x8] + 10:38:10.350169 B > A: . 224830:225366(536) ack 1 win 33232 [tos 0x8] + 10:38:10.356362 B > A: . 225366:225902(536) ack 1 win 33232 [tos 0x8] + + + +Paxson, et. al. Informational [Page 14] + +RFC 2525 TCP Implementation Problems March 1999 + + + 10:38:10.362445 B > A: . 225902:226438(536) ack 1 win 33232 [tos 0x8] + 10:38:10.368579 B > A: . 226438:226974(536) ack 1 win 33232 [tos 0x8] + 10:38:10.374732 B > A: . 
226974:227510(536) ack 1 win 33232 [tos 0x8] + 10:38:10.380825 B > A: . 227510:228046(536) ack 1 win 33232 [tos 0x8] + 10:38:10.387027 B > A: . 228046:228582(536) ack 1 win 33232 [tos 0x8] + 10:38:10.393053 B > A: . 228582:229118(536) ack 1 win 33232 [tos 0x8] + 10:38:10.399193 B > A: . 229118:229654(536) ack 1 win 33232 [tos 0x8] + 10:38:10.405356 B > A: . 229654:230190(536) ack 1 win 33232 [tos 0x8] + + A now receives 13 additional packets from B. These are above- + sequence because 222686:223222 was dropped. The packets do + however fit within the offered window of 25800. A does not + generate any duplicate ACKs for them. + + The trace contributor (V. Paxson) verified that these 13 packets + had valid IP and TCP checksums. + + 10:38:11.917728 B > A: . 222686:223222(536) ack 1 win 33232 [tos 0x8] + 10:38:11.930925 A > B: . ack 223222 win 32232 + + B times out for 222686:223222 and retransmits it. Upon receiving + it, A only acknowledges 223222. Had it retained the valid above- + sequence packets, it would instead have ack'd 230190. + + 10:38:12.048438 B > A: . 223222:223758(536) ack 1 win 33232 [tos 0x8] + 10:38:12.054397 B > A: . 223758:224294(536) ack 1 win 33232 [tos 0x8] + 10:38:12.068029 A > B: . ack 224294 win 31696 + + B retransmits two more packets, and A only acknowledges them. + This pattern continues as B retransmits the entire set of + previously-received packets. + + A second trace confirmed that the problem is repeatable. + + Trace file demonstrating correct behavior + Made using tcpdump recording at the receiving TCP (C). No losses + reported by the packet filter. + + 09:11:25.790417 D > C: . 33793:34305(512) ack 1 win 61440 + 09:11:25.791393 D > C: . 34305:34817(512) ack 1 win 61440 + 09:11:25.792369 D > C: . 34817:35329(512) ack 1 win 61440 + 09:11:25.792369 D > C: . 35329:35841(512) ack 1 win 61440 + 09:11:25.793345 D > C: . 36353:36865(512) ack 1 win 61440 + 09:11:25.794321 C > D: . 
ack 35841 win 59904 + + A sequence hole occurs because 35841:36353 has been dropped. + + + + + +Paxson, et. al. Informational [Page 15] + +RFC 2525 TCP Implementation Problems March 1999 + + + 09:11:25.794321 D > C: . 36865:37377(512) ack 1 win 61440 + 09:11:25.794321 C > D: . ack 35841 win 59904 + 09:11:25.795297 D > C: . 37377:37889(512) ack 1 win 61440 + 09:11:25.795297 C > D: . ack 35841 win 59904 + 09:11:25.796273 C > D: . ack 35841 win 61440 + 09:11:25.798225 D > C: . 37889:38401(512) ack 1 win 61440 + 09:11:25.799201 C > D: . ack 35841 win 61440 + 09:11:25.807009 D > C: . 38401:38913(512) ack 1 win 61440 + 09:11:25.807009 C > D: . ack 35841 win 61440 + (many additional lines omitted) + 09:11:25.884113 D > C: . 52737:53249(512) ack 1 win 61440 + 09:11:25.884113 C > D: . ack 35841 win 61440 + + Each additional, above-sequence packet C receives from D elicits a + duplicate ACK for 35841. + + 09:11:25.887041 D > C: . 35841:36353(512) ack 1 win 61440 + 09:11:25.887041 C > D: . ack 53249 win 44032 + + D retransmits 35841:36353 and C acknowledges receipt of data all + the way up to 53249. + + References + This problem is documented in [Paxson97]. + + How to detect + Packet loss is common enough in the Internet that generally it is + not difficult to find an Internet path that will result in some + above-sequence packets arriving. A TCP that exhibits "Failure to + retain ..." may not generate duplicate ACKs for these packets. + However, some TCPs that do retain above-sequence data also do not + generate duplicate ACKs, so failure to do so does not definitively + identify the problem. Instead, the key observation is whether + upon retransmission of the dropped packet, data that was + previously above-sequence is acknowledged. 
+ + Two considerations in detecting this problem using a packet trace + are that it is easiest to do so with a trace made at the TCP + receiver, in order to unambiguously determine which packets + arrived successfully, and that such packets may still be correctly + discarded if they arrive with checksum errors. The latter can be + tested by capturing the entire packet contents and performing the + IP and TCP checksum algorithms to verify their integrity; or by + confirming that the packets arrive with the same checksum and + contents as that with which they were sent, with a presumption + that the sending TCP correctly calculates checksums for the + packets it transmits. + + + + +Paxson, et. al. Informational [Page 16] + +RFC 2525 TCP Implementation Problems March 1999 + + + It is considerably easier to verify that an implementation does + NOT exhibit this problem. This can be done by recording a trace + at the data sender, and observing that sometimes after a + retransmission the receiver acknowledges a higher sequence number + than just that which was retransmitted. + + How to fix + If the root problem is that the implementation lacks buffer, then + unfortunately this requires significant work to fix. + However, doing so is important, for reasons outlined above. + +2.6. + + Name of Problem + Extra additive constant in congestion avoidance + + Classification + Congestion control / performance + + Description + RFC 1122 section 4.2.2.15 states that TCP MUST implement + Jacobson's "congestion avoidance" algorithm [Jacobson88], which + calls for increasing the congestion window, cwnd, by: + + MSS * MSS / cwnd + + for each ACK received for new data [RFC2001]. This has the effect + of increasing cwnd by approximately one segment in each round trip + time. 
+ + Some TCP implementations add an additional fraction of a segment + (typically MSS/8) to cwnd for each ACK received for new data + [Stevens94, Wright95]: + + (MSS * MSS / cwnd) + MSS/8 + + These implementations exhibit "Extra additive constant in + congestion avoidance". + + Significance + May be detrimental to performance even in completely uncongested + environments (see Implications). + + In congested environments, may also be detrimental to the + performance of other connections. + + + + + + +Paxson, et. al. Informational [Page 17] + +RFC 2525 TCP Implementation Problems March 1999 + + + Implications + The extra additive term allows a TCP to more aggressively open its + congestion window (quadratic rather than linear increase). For + congested networks, this can increase the loss rate experienced by + all connections sharing a bottleneck with the aggressive TCP. + + However, even for completely uncongested networks, the extra + additive term can lead to diminished performance, as follows. In + congestion avoidance, a TCP sender probes the network path to + determine its available capacity, which often equates to the + number of buffers available at a bottleneck link. With linear + congestion avoidance, the TCP only probes for sufficient capacity + (buffer) to hold one extra packet per RTT. + + Thus, when it exceeds the available capacity, generally only one + packet will be lost (since on the previous RTT it already found + that the path could sustain a window with one less packet in + flight). If the congestion window is sufficiently large, then the + TCP will recover from this single loss using fast retransmission + and avoid an expensive (in terms of performance) retransmission + timeout. + + However, when the additional additive term is used, then cwnd can + increase by more than one packet per RTT, in which case the TCP + probes more aggressively. 
If in the previous RTT it had reached + the available capacity of the path, then the excess due to the + extra increase will again be lost, but now this will result in + multiple losses from the flight instead of a single loss. TCPs + that do not utilize SACK [RFC2018] generally will not recover from + multiple losses without incurring a retransmission timeout + [Fall96,Hoe96], significantly diminishing performance. + + Relevant RFCs + RFC 1122 requires use of the "congestion avoidance" algorithm. + RFC 2001 outlines the fast retransmit/fast recovery algorithms. + RFC 2018 discusses the SACK option. + + Trace file demonstrating it + Recorded using tcpdump running on the same FDDI LAN as host A. + Host A is the sender and host B is the receiver. The connection + establishment specified an MSS of 4,312 bytes and a window scale + factor of 4. We omit the establishment and the first 2.5 MB of + data transfer, as the problem is best demonstrated when the window + has grown to a large value. At the beginning of the trace + excerpt, the congestion window is 31 packets. The connection is + never receiver-window limited, so we omit window advertisements + from the trace for clarity. + + + + +Paxson, et. al. Informational [Page 18] + +RFC 2525 TCP Implementation Problems March 1999 + + + 11:42:07.697951 B > A: . ack 2383006 + 11:42:07.699388 A > B: . 2508054:2512366(4312) + 11:42:07.699962 A > B: . 2512366:2516678(4312) + 11:42:07.700012 B > A: . ack 2391630 + 11:42:07.701081 A > B: . 2516678:2520990(4312) + 11:42:07.701656 A > B: . 2520990:2525302(4312) + 11:42:07.701739 B > A: . ack 2400254 + 11:42:07.702685 A > B: . 2525302:2529614(4312) + 11:42:07.703257 A > B: . 2529614:2533926(4312) + 11:42:07.703295 B > A: . ack 2408878 + 11:42:07.704414 A > B: . 2533926:2538238(4312) + 11:42:07.704989 A > B: . 2538238:2542550(4312) + 11:42:07.705040 B > A: . ack 2417502 + 11:42:07.705935 A > B: . 2542550:2546862(4312) + 11:42:07.706506 A > B: . 
2546862:2551174(4312) + 11:42:07.706544 B > A: . ack 2426126 + 11:42:07.707480 A > B: . 2551174:2555486(4312) + 11:42:07.708051 A > B: . 2555486:2559798(4312) + 11:42:07.708088 B > A: . ack 2434750 + 11:42:07.709030 A > B: . 2559798:2564110(4312) + 11:42:07.709604 A > B: . 2564110:2568422(4312) + 11:42:07.710175 A > B: . 2568422:2572734(4312) * + + 11:42:07.710215 B > A: . ack 2443374 + 11:42:07.710799 A > B: . 2572734:2577046(4312) + 11:42:07.711368 A > B: . 2577046:2581358(4312) + 11:42:07.711405 B > A: . ack 2451998 + 11:42:07.712323 A > B: . 2581358:2585670(4312) + 11:42:07.712898 A > B: . 2585670:2589982(4312) + 11:42:07.712938 B > A: . ack 2460622 + 11:42:07.713926 A > B: . 2589982:2594294(4312) + 11:42:07.714501 A > B: . 2594294:2598606(4312) + 11:42:07.714547 B > A: . ack 2469246 + 11:42:07.715747 A > B: . 2598606:2602918(4312) + 11:42:07.716287 A > B: . 2602918:2607230(4312) + 11:42:07.716328 B > A: . ack 2477870 + 11:42:07.717146 A > B: . 2607230:2611542(4312) + 11:42:07.717717 A > B: . 2611542:2615854(4312) + 11:42:07.717762 B > A: . ack 2486494 + 11:42:07.718754 A > B: . 2615854:2620166(4312) + 11:42:07.719331 A > B: . 2620166:2624478(4312) + 11:42:07.719906 A > B: . 2624478:2628790(4312) ** + + 11:42:07.719958 B > A: . ack 2495118 + 11:42:07.720500 A > B: . 2628790:2633102(4312) + 11:42:07.721080 A > B: . 2633102:2637414(4312) + 11:42:07.721739 B > A: . ack 2503742 + 11:42:07.722348 A > B: . 2637414:2641726(4312) + + + +Paxson, et. al. Informational [Page 19] + +RFC 2525 TCP Implementation Problems March 1999 + + + 11:42:07.722918 A > B: . 2641726:2646038(4312) + 11:42:07.769248 B > A: . ack 2512366 + + The receiver's acknowledgment policy is one ACK per two packets + received. Thus, for each ACK arriving at host A, two new packets + are sent, except when cwnd increases due to congestion avoidance, + in which case three new packets are sent. + + With an ack-every-two-packets policy, cwnd should only increase + one MSS per 2 RTT. 
However, at the point marked "*" the window + increases after 7 ACKs have arrived, and then again at "**" after + 6 more ACKs. + + While we do not have space to show the effect, this trace suffered + from repeated timeout retransmissions due to multiple packet + losses during a single RTT. + + Trace file demonstrating correct behavior + Made using the same host and tracing setup as above, except now + A's TCP has been modified to remove the MSS/8 additive constant. + Tcpdump reported 77 packet drops; the excerpt below is fully + self-consistent so it is unlikely that any of these occurred + during the excerpt. + + We again begin when cwnd is 31 packets (this occurs significantly + later in the trace, because the congestion avoidance is now less + aggressive with opening the window). + + 14:22:21.236757 B > A: . ack 5194679 + 14:22:21.238192 A > B: . 5319727:5324039(4312) + 14:22:21.238770 A > B: . 5324039:5328351(4312) + 14:22:21.238821 B > A: . ack 5203303 + 14:22:21.240158 A > B: . 5328351:5332663(4312) + 14:22:21.240738 A > B: . 5332663:5336975(4312) + 14:22:21.270422 B > A: . ack 5211927 + 14:22:21.271883 A > B: . 5336975:5341287(4312) + 14:22:21.272458 A > B: . 5341287:5345599(4312) + 14:22:21.279099 B > A: . ack 5220551 + 14:22:21.280539 A > B: . 5345599:5349911(4312) + 14:22:21.281118 A > B: . 5349911:5354223(4312) + 14:22:21.281183 B > A: . ack 5229175 + 14:22:21.282348 A > B: . 5354223:5358535(4312) + 14:22:21.283029 A > B: . 5358535:5362847(4312) + 14:22:21.283089 B > A: . ack 5237799 + 14:22:21.284213 A > B: . 5362847:5367159(4312) + 14:22:21.284779 A > B: . 5367159:5371471(4312) + 14:22:21.285976 B > A: . ack 5246423 + 14:22:21.287465 A > B: . 5371471:5375783(4312) + + + +Paxson, et. al. Informational [Page 20] + +RFC 2525 TCP Implementation Problems March 1999 + + + 14:22:21.288036 A > B: . 5375783:5380095(4312) + 14:22:21.288073 B > A: . ack 5255047 + 14:22:21.289155 A > B: . 5380095:5384407(4312) + 14:22:21.289725 A > B: . 
5384407:5388719(4312) + 14:22:21.289762 B > A: . ack 5263671 + 14:22:21.291090 A > B: . 5388719:5393031(4312) + 14:22:21.291662 A > B: . 5393031:5397343(4312) + 14:22:21.291701 B > A: . ack 5272295 + 14:22:21.292870 A > B: . 5397343:5401655(4312) + 14:22:21.293441 A > B: . 5401655:5405967(4312) + 14:22:21.293481 B > A: . ack 5280919 + 14:22:21.294476 A > B: . 5405967:5410279(4312) + 14:22:21.295053 A > B: . 5410279:5414591(4312) + 14:22:21.295106 B > A: . ack 5289543 + 14:22:21.296306 A > B: . 5414591:5418903(4312) + 14:22:21.296878 A > B: . 5418903:5423215(4312) + 14:22:21.296917 B > A: . ack 5298167 + 14:22:21.297716 A > B: . 5423215:5427527(4312) + 14:22:21.298285 A > B: . 5427527:5431839(4312) + 14:22:21.298324 B > A: . ack 5306791 + 14:22:21.299413 A > B: . 5431839:5436151(4312) + 14:22:21.299986 A > B: . 5436151:5440463(4312) + 14:22:21.303696 B > A: . ack 5315415 + 14:22:21.305177 A > B: . 5440463:5444775(4312) + 14:22:21.305755 A > B: . 5444775:5449087(4312) + 14:22:21.308032 B > A: . ack 5324039 + 14:22:21.309525 A > B: . 5449087:5453399(4312) + 14:22:21.310101 A > B: . 5453399:5457711(4312) + 14:22:21.310144 B > A: . ack 5332663 *** + + 14:22:21.311615 A > B: . 5457711:5462023(4312) + 14:22:21.312198 A > B: . 5462023:5466335(4312) + 14:22:21.341876 B > A: . ack 5341287 + 14:22:21.343451 A > B: . 5466335:5470647(4312) + 14:22:21.343985 A > B: . 5470647:5474959(4312) + 14:22:21.350304 B > A: . ack 5349911 + 14:22:21.351852 A > B: . 5474959:5479271(4312) + 14:22:21.352430 A > B: . 5479271:5483583(4312) + 14:22:21.352484 B > A: . ack 5358535 + 14:22:21.353574 A > B: . 5483583:5487895(4312) + 14:22:21.354149 A > B: . 5487895:5492207(4312) + 14:22:21.354205 B > A: . ack 5367159 + 14:22:21.355467 A > B: . 5492207:5496519(4312) + 14:22:21.356039 A > B: . 5496519:5500831(4312) + 14:22:21.357361 B > A: . ack 5375783 + 14:22:21.358855 A > B: . 5500831:5505143(4312) + 14:22:21.359424 A > B: . 5505143:5509455(4312) + 14:22:21.359465 B > A: . 
ack 5384407 + + + +Paxson, et. al. Informational [Page 21] + +RFC 2525 TCP Implementation Problems March 1999 + + + 14:22:21.360605 A > B: . 5509455:5513767(4312) + 14:22:21.361181 A > B: . 5513767:5518079(4312) + 14:22:21.361225 B > A: . ack 5393031 + 14:22:21.362485 A > B: . 5518079:5522391(4312) + 14:22:21.363057 A > B: . 5522391:5526703(4312) + 14:22:21.363096 B > A: . ack 5401655 + 14:22:21.364236 A > B: . 5526703:5531015(4312) + 14:22:21.364810 A > B: . 5531015:5535327(4312) + 14:22:21.364867 B > A: . ack 5410279 + 14:22:21.365819 A > B: . 5535327:5539639(4312) + 14:22:21.366386 A > B: . 5539639:5543951(4312) + 14:22:21.366427 B > A: . ack 5418903 + 14:22:21.367586 A > B: . 5543951:5548263(4312) + 14:22:21.368158 A > B: . 5548263:5552575(4312) + 14:22:21.368199 B > A: . ack 5427527 + 14:22:21.369189 A > B: . 5552575:5556887(4312) + 14:22:21.369758 A > B: . 5556887:5561199(4312) + 14:22:21.369803 B > A: . ack 5436151 + 14:22:21.370814 A > B: . 5561199:5565511(4312) + 14:22:21.371398 A > B: . 5565511:5569823(4312) + 14:22:21.375159 B > A: . ack 5444775 + 14:22:21.376658 A > B: . 5569823:5574135(4312) + 14:22:21.377235 A > B: . 5574135:5578447(4312) + 14:22:21.379303 B > A: . ack 5453399 + 14:22:21.380802 A > B: . 5578447:5582759(4312) + 14:22:21.381377 A > B: . 5582759:5587071(4312) + 14:22:21.381947 A > B: . 5587071:5591383(4312) **** + + "***" marks the end of the first round trip. Note that cwnd did + not increase (as evidenced by each ACK eliciting two new data + packets). Only at "****", which comes near the end of the second + round trip, does cwnd increase by one packet. + + This trace did not suffer any timeout retransmissions. It + transferred the same amount of data as the first trace in about + half as much time. This difference is repeatable between hosts A + and B. + + References + [Stevens94] and [Wright95] discuss this problem. 
The problem of + Reno TCP failing to recover from multiple losses except via a + retransmission timeout is discussed in [Fall96,Hoe96]. + + + + + + + + + +Paxson, et. al. Informational [Page 22] + +RFC 2525 TCP Implementation Problems March 1999 + + + How to detect + If source code is available, that is generally the easiest way to + detect this problem. Search for each modification to the cwnd + variable; (at least) one of these will be for congestion + avoidance, and inspection of the related code should immediately + identify the problem if present. + + The problem can also be detected by closely examining packet + traces taken near the sender. During congestion avoidance, cwnd + will increase by an additional segment upon the receipt of + (typically) eight acknowledgements without a loss. This increase + is in addition to the one segment increase per round trip time (or + two round trip times if the receiver is using delayed ACKs). + + Furthermore, graphs of the sequence number vs. time, taken from + packet traces, are normally linear during congestion avoidance. + When viewing packet traces of transfers from senders exhibiting + this problem, the graphs appear quadratic instead of linear. + + Finally, the traces will show that, with sufficiently large + windows, nearly every loss event results in a timeout. + + How to fix + This problem may be corrected by removing the "+ MSS/8" term from + the congestion avoidance code that increases cwnd each time an ACK + of new data is received. + +2.7. + + Name of Problem + Initial RTO too low + + Classification + Performance + + Description + When a TCP first begins transmitting data, it lacks the RTT + measurements necessary to have computed an adaptive retransmission + timeout (RTO). RFC 1122, 4.2.3.1, states that a TCP SHOULD + initialize RTO to 3 seconds. A TCP that uses a lower value + exhibits "Initial RTO too low". 
+ + Significance + In environments with large RTTs (where "large" means any value + larger than the initial RTO), TCPs will experience very poor + performance. + + + + + +Paxson, et. al. Informational [Page 23] + +RFC 2525 TCP Implementation Problems March 1999 + + + Implications + Whenever RTO < RTT, very poor performance can result as packets + are unnecessarily retransmitted (because RTO will expire before an + ACK for the packet can arrive) and the connection enters slow + start and congestion avoidance. Generally, the algorithms for + computing RTO avoid this problem by adding a positive term to the + estimated RTT. However, when a connection first begins it must + use some estimate for RTO, and if it picks a value less than RTT, + the above problems will arise. + + Furthermore, when the initial RTO < RTT, it can take a long time + for the TCP to correct the problem by adapting the RTT estimate, + because the use of Karn's algorithm (mandated by RFC 1122, + 4.2.3.1) will discard many of the candidate RTT measurements made + after the first timeout, since they will be measurements of + retransmitted segments. + + Relevant RFCs + RFC 1122 states that TCPs SHOULD initialize RTO to 3 seconds and + MUST implement Karn's algorithm. + + Trace file demonstrating it + The following trace file was taken using tcpdump at host A, the + data sender. The advertised window and SYN options have been + omitted for clarity. + + 07:52:39.870301 A > B: S 2786333696:2786333696(0) + 07:52:40.548170 B > A: S 130240000:130240000(0) ack 2786333697 + 07:52:40.561287 A > B: P 1:513(512) ack 1 + 07:52:40.753466 A > B: . 1:513(512) ack 1 + 07:52:41.133687 A > B: . 1:513(512) ack 1 + 07:52:41.458529 B > A: . ack 513 + 07:52:41.458686 A > B: . 513:1025(512) ack 1 + 07:52:41.458797 A > B: P 1025:1537(512) ack 1 + 07:52:41.541633 B > A: . ack 513 + 07:52:41.703732 A > B: . 513:1025(512) ack 1 + 07:52:42.044875 B > A: . ack 513 + 07:52:42.173728 A > B: . 
513:1025(512) ack 1 + 07:52:42.330861 B > A: . ack 1537 + 07:52:42.331129 A > B: . 1537:2049(512) ack 1 + 07:52:42.331262 A > B: P 2049:2561(512) ack 1 + 07:52:42.623673 A > B: . 1537:2049(512) ack 1 + 07:52:42.683203 B > A: . ack 1537 + 07:52:43.044029 B > A: . ack 1537 + 07:52:43.193812 A > B: . 1537:2049(512) ack 1 + + + + + + +Paxson, et. al. Informational [Page 24] + +RFC 2525 TCP Implementation Problems March 1999 + + + Note from the SYN/SYN-ACK exchange, the RTT is over 600 msec. + However, from the elapsed time between the third and fourth lines + (the first packet being sent and then retransmitted), it is + apparent the RTO was initialized to under 200 msec. The next line + shows that this value has doubled to 400 msec (correct exponential + backoff of RTO), but that still does not suffice to avoid an + unnecessary retransmission. + + Finally, an ACK from B arrives for the first segment. Later two + more duplicate ACKs for 513 arrive, indicating that both the + original and the two retransmissions arrived at B. (Indeed, a + concurrent trace at B showed that no packets were lost during the + entire connection). This ACK opens the congestion window to two + packets, which are sent back-to-back, but at 07:52:41.703732 RTO + again expires after a little over 200 msec, leading to an + unnecessary retransmission, and the pattern repeats. By the end + of the trace excerpt above, 1536 bytes have been successfully + transmitted from A to B, over an interval of more than 2 seconds, + reflecting terrible performance. + + Trace file demonstrating correct behavior + The following trace file was taken using tcpdump at host C, the + data sender. The advertised window and SYN options have been + omitted for clarity. + + 17:30:32.090299 C > D: S 2031744000:2031744000(0) + 17:30:32.900325 D > C: S 262737964:262737964(0) ack 2031744001 + 17:30:32.900326 C > D: . ack 1 + 17:30:32.910326 C > D: . 1:513(512) ack 1 + 17:30:34.150355 D > C: . ack 513 + 17:30:34.150356 C > D: . 
513:1025(512) ack 1 + 17:30:34.150357 C > D: . 1025:1537(512) ack 1 + 17:30:35.170384 D > C: . ack 1025 + 17:30:35.170385 C > D: . 1537:2049(512) ack 1 + 17:30:35.170386 C > D: . 2049:2561(512) ack 1 + 17:30:35.320385 D > C: . ack 1537 + 17:30:35.320386 C > D: . 2561:3073(512) ack 1 + 17:30:35.320387 C > D: . 3073:3585(512) ack 1 + 17:30:35.730384 D > C: . ack 2049 + + The initial SYN/SYN-ACK exchange shows that RTT is more than 800 + msec, and for some subsequent packets it rises above 1 second, but + C's retransmit timer does not ever expire. + + References + This problem is documented in [Paxson97]. + + + + + +Paxson, et. al. Informational [Page 25] + +RFC 2525 TCP Implementation Problems March 1999 + + + How to detect + This problem is readily detected by inspecting a packet trace of + the startup of a TCP connection made over a long-delay path. It + can be diagnosed from either a sender-side or receiver-side trace. + Long-delay paths can often be found by locating remote sites on + other continents. + + How to fix + As this problem arises from a faulty initialization, one hopes + fixing it requires a one-line change to the TCP source code. + +2.8. + + Name of Problem + Failure of window deflation after loss recovery + + Classification + Congestion control / performance + + Description + The fast recovery algorithm allows TCP senders to continue to + transmit new segments during loss recovery. First, fast + retransmission is initiated after a TCP sender receives three + duplicate ACKs. At this point, a retransmission is sent and cwnd + is halved. The fast recovery algorithm then allows additional + segments to be sent when sufficient additional duplicate ACKs + arrive. Some implementations of fast recovery compute when to + send additional segments by artificially incrementing cwnd, first + by three segments to account for the three duplicate ACKs that + triggered fast retransmission, and subsequently by 1 MSS for each + new duplicate ACK that arrives. 
When cwnd allows, the sender + transmits new data segments. + + When an ACK arrives that covers new data, cwnd is to be reduced by + the amount by which it was artificially increased. However, some + TCP implementations fail to "deflate" the window, causing an + inappropriate amount of data to be sent into the network after + recovery. One cause of this problem is the "header prediction" + code, which is used to handle incoming segments that require + little work. In some implementations of TCP, the header + prediction code does not check to make sure cwnd has not been + artificially inflated, and therefore does not reduce the + artificially increased cwnd when appropriate. + + Significance + TCP senders that exhibit this problem will transmit a burst of + data immediately after recovery, which can degrade performance, as + well as network stability. Effectively, the sender does not + + + +Paxson, et. al. Informational [Page 26] + +RFC 2525 TCP Implementation Problems March 1999 + + + reduce the size of cwnd as much as it should (to half its value + when loss was detected), if at all. This can harm the performance + of the TCP connection itself, as well as competing TCP flows. + + Implications + A TCP sender exhibiting this problem does not reduce cwnd + appropriately in times of congestion, and therefore may contribute + to congestive collapse. + + Relevant RFCs + RFC 2001 outlines the fast retransmit/fast recovery algorithms. + [Brakmo95] outlines this implementation problem and offers a fix. + + Trace file demonstrating it + The following trace file was taken using tcpdump at host A, the + data sender. The advertised window (which never changed) has been + omitted for clarity, except for the first packet sent by each + host. + + 08:22:56.825635 A.7505 > B.7505: . 29697:30209(512) ack 1 win 4608 + 08:22:57.038794 B.7505 > A.7505: . ack 27649 win 4096 + 08:22:57.039279 A.7505 > B.7505: . 30209:30721(512) ack 1 + 08:22:57.321876 B.7505 > A.7505: . 
ack 28161 + 08:22:57.322356 A.7505 > B.7505: . 30721:31233(512) ack 1 + 08:22:57.347128 B.7505 > A.7505: . ack 28673 + 08:22:57.347572 A.7505 > B.7505: . 31233:31745(512) ack 1 + 08:22:57.347782 A.7505 > B.7505: . 31745:32257(512) ack 1 + 08:22:57.936393 B.7505 > A.7505: . ack 29185 + 08:22:57.936864 A.7505 > B.7505: . 32257:32769(512) ack 1 + 08:22:57.950802 B.7505 > A.7505: . ack 29697 win 4096 + 08:22:57.951246 A.7505 > B.7505: . 32769:33281(512) ack 1 + 08:22:58.169422 B.7505 > A.7505: . ack 29697 + 08:22:58.638222 B.7505 > A.7505: . ack 29697 + 08:22:58.643312 B.7505 > A.7505: . ack 29697 + 08:22:58.643669 A.7505 > B.7505: . 29697:30209(512) ack 1 + 08:22:58.936436 B.7505 > A.7505: . ack 29697 + 08:22:59.002614 B.7505 > A.7505: . ack 29697 + 08:22:59.003026 A.7505 > B.7505: . 33281:33793(512) ack 1 + 08:22:59.682902 B.7505 > A.7505: . ack 33281 + 08:22:59.683391 A.7505 > B.7505: P 33793:34305(512) ack 1 + 08:22:59.683748 A.7505 > B.7505: P 34305:34817(512) ack 1 *** + 08:22:59.684043 A.7505 > B.7505: P 34817:35329(512) ack 1 + 08:22:59.684266 A.7505 > B.7505: P 35329:35841(512) ack 1 + 08:22:59.684567 A.7505 > B.7505: P 35841:36353(512) ack 1 + 08:22:59.684810 A.7505 > B.7505: P 36353:36865(512) ack 1 + 08:22:59.685094 A.7505 > B.7505: P 36865:37377(512) ack 1 + + + + + +Paxson, et. al. Informational [Page 27] + +RFC 2525 TCP Implementation Problems March 1999 + + + The first 12 lines of the trace show incoming ACKs clocking out a + window of data segments. At this point in the transfer, cwnd is 7 + segments. The next 4 lines of the trace show 3 duplicate ACKs + arriving from the receiver, followed by a retransmission from the + sender. At this point, cwnd is halved (to 3 segments) and + artificially incremented by the three duplicate ACKs that have + arrived, making cwnd 6 segments. The next two lines show 2 more + duplicate ACKs arriving, each of which increases cwnd by 1 + segment. 
So, after these two duplicate ACKs arrive the cwnd is 8 + segments and the sender has permission to send 1 new segment + (since there are 7 segments outstanding). The next line in the + trace shows this new segment being transmitted. The next packet + shown in the trace is an ACK from host B that covers the first 7 + outstanding segments (all but the new segment sent during + recovery). This should cause cwnd to be reduced to 3 segments and + 2 segments to be transmitted (since there is already 1 outstanding + segment in the network). However, as shown by the last 7 lines of + the trace, cwnd is not reduced, causing a line-rate burst of 7 new + segments. + + Trace file demonstrating correct behavior + The trace would appear identical to the one above, only it would + stop after the line marked "***", because at this point host A + would correctly reduce cwnd after recovery, allowing only 2 + segments to be transmitted, rather than producing a burst of 7 + segments. + + References + This problem is documented and the performance implications + analyzed in [Brakmo95]. + + How to detect + Failure of window deflation after loss recovery can be found by + examining sender-side packet traces recorded during periods of + moderate loss (so cwnd can grow large enough to allow for fast + recovery when loss occurs). + + How to fix + When this bug is caused by incorrect header prediction, the fix is + to add a predicate to the header prediction test that checks to + see whether cwnd is inflated; if so, the header prediction test + fails and the usual ACK processing occurs, which (in this case) + takes care to deflate the window. See [Brakmo95] for details. + +2.9. + + Name of Problem + Excessively short keepalive connection timeout + + + +Paxson, et. al. Informational [Page 28] + +RFC 2525 TCP Implementation Problems March 1999 + + + Classification + Reliability + + Description + Keep-alive is a mechanism for checking whether an idle connection + is still alive. 
According to RFC 1122, keepalive should only be + invoked in server applications that might otherwise hang + indefinitely and consume resources unnecessarily if a client + crashes or aborts a connection during a network failure. + + RFC 1122 also specifies that if a keep-alive mechanism is + implemented it MUST NOT interpret failure to respond to any + specific probe as a dead connection. The RFC does not specify a + particular mechanism for timing out a connection when no response + is received for keepalive probes. However, if the mechanism does + not allow ample time for recovery from network congestion or + delay, connections may be timed out unnecessarily. + + Significance + In congested networks, can lead to unwarranted termination of + connections. + + Implications + It is possible for the network connection between two peer + machines to become congested or to exhibit packet loss at the time + that a keep-alive probe is sent on a connection. If the keep- + alive mechanism does not allow sufficient time before dropping + connections in the face of unacknowledged probes, connections may + be dropped even when both peers of a connection are still alive. + + Relevant RFCs + RFC 1122 specifies that the keep-alive mechanism may be provided. + It does not specify a mechanism for determining dead connections + when keepalive probes are not acknowledged. + + Trace file demonstrating it + Made using the Orchestra tool at the peer of the machine using + keep-alive. After connection establishment, incoming keep-alives + were dropped by Orchestra to simulate a dead connection. 
+ + 22:11:12.040000 A > B: 22666019:0 win 8192 datasz 4 SYN + 22:11:12.060000 B > A: 2496001:22666020 win 4096 datasz 4 SYN ACK + 22:11:12.130000 A > B: 22666020:2496002 win 8760 datasz 0 ACK + (more than two hours elapse) + 00:23:00.680000 A > B: 22666019:2496002 win 8760 datasz 1 ACK + 00:23:01.770000 A > B: 22666019:2496002 win 8760 datasz 1 ACK + 00:23:02.870000 A > B: 22666019:2496002 win 8760 datasz 1 ACK + 00:23:03.970000 A > B: 22666019:2496002 win 8760 datasz 1 ACK + + + +Paxson, et. al. Informational [Page 29] + +RFC 2525 TCP Implementation Problems March 1999 + + + 00:23:05.070000 A > B: 22666019:2496002 win 8760 datasz 1 ACK + + The initial three packets are the SYN exchange for connection + setup. About two hours later, the keepalive timer fires because + the connection has been idle. Keepalive probes are transmitted a + total of 5 times, with a 1 second spacing between probes, after + which the connection is dropped. This is problematic because a 5 + second network outage at the time of the first probe results in + the connection being killed. + + Trace file demonstrating correct behavior + Made using the Orchestra tool at the peer of the machine using + keep-alive. After connection establishment, incoming keep-alives + were dropped by Orchestra to simulate a dead connection. 
+ + 16:01:52.130000 A > B: 1804412929:0 win 4096 datasz 4 SYN + 16:01:52.360000 B > A: 16512001:1804412930 win 4096 datasz 4 SYN ACK + 16:01:52.410000 A > B: 1804412930:16512002 win 4096 datasz 0 ACK + (two hours elapse) + 18:01:57.170000 A > B: 1804412929:16512002 win 4096 datasz 0 ACK + 18:03:12.220000 A > B: 1804412929:16512002 win 4096 datasz 0 ACK + 18:04:27.270000 A > B: 1804412929:16512002 win 4096 datasz 0 ACK + 18:05:42.320000 A > B: 1804412929:16512002 win 4096 datasz 0 ACK + 18:06:57.370000 A > B: 1804412929:16512002 win 4096 datasz 0 ACK + 18:08:12.420000 A > B: 1804412929:16512002 win 4096 datasz 0 ACK + 18:09:27.480000 A > B: 1804412929:16512002 win 4096 datasz 0 ACK + 18:10:43.290000 A > B: 1804412929:16512002 win 4096 datasz 0 ACK + 18:11:57.580000 A > B: 1804412929:16512002 win 4096 datasz 0 ACK + 18:13:12.630000 A > B: 1804412929:16512002 win 4096 datasz 0 RST ACK + + In this trace, when the keep-alive timer expires, 9 keepalive + probes are sent at 75 second intervals. 75 seconds after the last + probe is sent, a final RST segment is sent indicating that the + connection has been closed. This implementation waits about 11 + minutes before timing out the connection, while the first + implementation shown allows only 5 seconds. + + References + This problem is documented in [Dawson97]. + + How to detect + For implementations manifesting this problem, it shows up on a + packet trace after the keepalive timer fires if the peer machine + receiving the keepalive does not respond. Usually the keepalive + timer will fire at least two hours after keepalive is turned on, + but it may be sooner if the timer value has been configured lower, + or if the keepalive mechanism violates the specification (see + Insufficient interval between keepalives problem). In this + + + +Paxson, et. al. 
Informational [Page 30] + +RFC 2525 TCP Implementation Problems March 1999 + + + example, suppressing the response of the peer to keepalive probes + was accomplished using the Orchestra toolkit, which can be + configured to drop packets. It could also have been done by + creating a connection, turning on keepalive, and disconnecting the + network connection at the receiver machine. + + How to fix + This problem can be fixed by using a different method for timing + out keepalives that allows a longer period of time to elapse + before dropping the connection. For example, the algorithm for + timing out on dropped data could be used. Another possibility is + an algorithm such as the one shown in the trace above, which sends + 9 probes at 75 second intervals and then waits an additional 75 + seconds for a response before closing the connection. + +2.10. + + Name of Problem + Failure to back off retransmission timeout + + Classification + Congestion control / reliability + + Description + The retransmission timeout is used to determine when a packet has + been dropped in the network. When this timeout has expired + without the arrival of an ACK, the segment is retransmitted. Each + time a segment is retransmitted, the timeout is adjusted according + to an exponential backoff algorithm, doubling each time. If a TCP + fails to receive an ACK after numerous attempts at retransmitting + the same segment, it terminates the connection. A TCP that fails + to double its retransmission timeout upon repeated timeouts is + said to exhibit "Failure to back off retransmission timeout". + + Significance + Backing off the retransmission timer is a cornerstone of network + stability in the presence of congestion. Consequently, this bug + can have severe adverse effects in congested networks. It also + affects TCP reliability in congested networks, as discussed in the + next section. 
+ + Implications + It is possible for the network connection between two TCP peers to + become congested or to exhibit packet loss at the time that a + retransmission is sent on a connection. If the retransmission + mechanism does not allow sufficient time before dropping + + + + + +Paxson, et. al. Informational [Page 31] + +RFC 2525 TCP Implementation Problems March 1999 + + + connections in the face of unacknowledged segments, connections + may be dropped even when, by waiting longer, the connection could + have continued. + + Relevant RFCs + RFC 1122 specifies mandatory exponential backoff of the + retransmission timeout, and the termination of connections after + some period of time (at least 100 seconds). + + Trace file demonstrating it + Made using tcpdump on an intermediate host: + + 16:51:12.671727 A > B: S 510878852:510878852(0) win 16384 + 16:51:12.672479 B > A: S 2392143687:2392143687(0) + ack 510878853 win 16384 + 16:51:12.672581 A > B: . ack 1 win 16384 + 16:51:15.244171 A > B: P 1:3(2) ack 1 win 16384 + 16:51:15.244933 B > A: . ack 3 win 17518 (DF) + + + + 16:51:19.381176 A > B: P 3:5(2) ack 1 win 16384 + 16:51:20.162016 A > B: P 3:5(2) ack 1 win 16384 + 16:51:21.161936 A > B: P 3:5(2) ack 1 win 16384 + 16:51:22.161914 A > B: P 3:5(2) ack 1 win 16384 + 16:51:23.161914 A > B: P 3:5(2) ack 1 win 16384 + 16:51:24.161879 A > B: P 3:5(2) ack 1 win 16384 + 16:51:25.161857 A > B: P 3:5(2) ack 1 win 16384 + 16:51:26.161836 A > B: P 3:5(2) ack 1 win 16384 + 16:51:27.161814 A > B: P 3:5(2) ack 1 win 16384 + 16:51:28.161791 A > B: P 3:5(2) ack 1 win 16384 + 16:51:29.161769 A > B: P 3:5(2) ack 1 win 16384 + 16:51:30.161750 A > B: P 3:5(2) ack 1 win 16384 + 16:51:31.161727 A > B: P 3:5(2) ack 1 win 16384 + + 16:51:32.161701 A > B: R 5:5(0) ack 1 win 16384 + + The initial three packets are the SYN exchange for connection + setup, then a single data packet, to verify that data can be + transferred. 
Then the connection to the destination host was + disconnected, and more data sent. Retransmissions occur every + second for 12 seconds, and then the connection is terminated with + a RST. This is problematic because a 12 second pause in + connectivity could result in the termination of a connection. + + Trace file demonstrating correct behavior + Again, a tcpdump taken from a third host: + + + + +Paxson, et. al. Informational [Page 32] + +RFC 2525 TCP Implementation Problems March 1999 + + + 16:59:05.398301 A > B: S 2503324757:2503324757(0) win 16384 + 16:59:05.399673 B > A: S 2492674648:2492674648(0) + ack 2503324758 win 16384 + 16:59:05.399866 A > B: . ack 1 win 17520 + 16:59:06.538107 A > B: P 1:3(2) ack 1 win 17520 + 16:59:06.540977 B > A: . ack 3 win 17518 (DF) + + + + 16:59:13.121542 A > B: P 3:5(2) ack 1 win 17520 + 16:59:14.010928 A > B: P 3:5(2) ack 1 win 17520 + 16:59:16.010979 A > B: P 3:5(2) ack 1 win 17520 + 16:59:20.011229 A > B: P 3:5(2) ack 1 win 17520 + 16:59:28.011896 A > B: P 3:5(2) ack 1 win 17520 + 16:59:44.013200 A > B: P 3:5(2) ack 1 win 17520 + 17:00:16.015766 A > B: P 3:5(2) ack 1 win 17520 + 17:01:20.021308 A > B: P 3:5(2) ack 1 win 17520 + 17:02:24.027752 A > B: P 3:5(2) ack 1 win 17520 + 17:03:28.034569 A > B: P 3:5(2) ack 1 win 17520 + 17:04:32.041567 A > B: P 3:5(2) ack 1 win 17520 + 17:05:36.048264 A > B: P 3:5(2) ack 1 win 17520 + 17:06:40.054900 A > B: P 3:5(2) ack 1 win 17520 + + 17:07:44.061306 A > B: R 5:5(0) ack 1 win 17520 + + In this trace, when the retransmission timer expires, 12 + retransmissions are sent at exponentially-increasing intervals, + until the interval value reaches 64 seconds, at which time the + interval stops growing. 64 seconds after the last retransmission, + a final RST segment is sent indicating that the connection has + been closed. This implementation waits about 9 minutes before + timing out the connection, while the first implementation shown + allows only 12 seconds. + + References + None known. 
+ + How to detect + A simple transfer can be easily interrupted by disconnecting the + receiving host from the network. tcpdump or another appropriate + tool should show the retransmissions being sent. Several trials + in a low-rtt environment may be required to demonstrate the bug. + + How to fix + For one of the implementations studied, this problem seemed to be + the result of an error introduced with the addition of the + Brakmo-Peterson RTO algorithm [Brakmo95], which can return a value + of zero where the older Jacobson algorithm always returns a + + + +Paxson, et. al. Informational [Page 33] + +RFC 2525 TCP Implementation Problems March 1999 + + + positive value. Brakmo and Peterson specified an additional step + of min(rtt + 2, RTO) to avoid problems with this. Unfortunately, + in the implementation this step was omitted when calculating the + exponential backoff for the RTO. This results in an RTO of 0 + seconds being multiplied by the backoff, yielding again zero, and + then being subjected to a later MAX operation that increases it to + 1 second, regardless of the backoff factor. + + A similar TCP persist failure has the same cause. + +2.11. + + Name of Problem + Insufficient interval between keepalives + + Classification + Reliability + + Description + Keep-alive is a mechanism for checking whether an idle connection + is still alive. According to RFC 1122, keep-alive may be included + in an implementation. If it is included, the interval between + keep-alive packets MUST be configurable, and MUST default to no + less than two hours. + + Significance + In congested networks, can lead to unwarranted termination of + connections. 
+ + Implications + According to RFC 1122, keep-alive is not required of + implementations because it could: (1) cause perfectly good + connections to break during transient Internet failures; (2) + consume unnecessary bandwidth ("if no one is using the connection, + who cares if it is still good?"); and (3) cost money for an + Internet path that charges for packets. Regarding this last + point, we note that in addition the presence of dial-on-demand + links in the route can greatly magnify the cost penalty of excess + keepalives, potentially forcing a full-time connection on a link + that would otherwise only be connected a few minutes a day. + + If keepalive is provided the RFC states that the required inter- + keepalive distance MUST default to no less than two hours. If it + does not, the probability of connections breaking increases, the + bandwidth used due to keepalives increases, and cost increases + over paths which charge per packet. + + + + + +Paxson, et. al. Informational [Page 34] + +RFC 2525 TCP Implementation Problems March 1999 + + + Relevant RFCs + RFC 1122 specifies that the keep-alive mechanism may be provided. + It also specifies the two hour minimum for the default interval + between keepalive probes. + + Trace file demonstrating it + Made using the Orchestra tool at the peer of the machine using + keep-alive. Machine A was configured to use default settings for + the keepalive timer. 
+ + 11:36:32.910000 A > B: 3288354305:0 win 28672 datasz 4 SYN + 11:36:32.930000 B > A: 896001:3288354306 win 4096 datasz 4 SYN ACK + 11:36:32.950000 A > B: 3288354306:896002 win 28672 datasz 0 ACK + + 11:50:01.190000 A > B: 3288354305:896002 win 28672 datasz 0 ACK + 11:50:01.210000 B > A: 896002:3288354306 win 4096 datasz 0 ACK + + 12:03:29.410000 A > B: 3288354305:896002 win 28672 datasz 0 ACK + 12:03:29.430000 B > A: 896002:3288354306 win 4096 datasz 0 ACK + + 12:16:57.630000 A > B: 3288354305:896002 win 28672 datasz 0 ACK + 12:16:57.650000 B > A: 896002:3288354306 win 4096 datasz 0 ACK + + 12:30:25.850000 A > B: 3288354305:896002 win 28672 datasz 0 ACK + 12:30:25.870000 B > A: 896002:3288354306 win 4096 datasz 0 ACK + + 12:43:54.070000 A > B: 3288354305:896002 win 28672 datasz 0 ACK + 12:43:54.090000 B > A: 896002:3288354306 win 4096 datasz 0 ACK + + The initial three packets are the SYN exchange for connection + setup. About 13 minutes later, the keepalive timer fires because + the connection is idle. The keepalive is acknowledged, and the + timer fires again in about 13 more minutes. This behavior + continues indefinitely until the connection is closed, and is a + violation of the specification. + + Trace file demonstrating correct behavior + Made using the Orchestra tool at the peer of the machine using + keep-alive. Machine A was configured to use default settings for + the keepalive timer. + + 17:37:20.500000 A > B: 34155521:0 win 4096 datasz 4 SYN + 17:37:20.520000 B > A: 6272001:34155522 win 4096 datasz 4 SYN ACK + 17:37:20.540000 A > B: 34155522:6272002 win 4096 datasz 0 ACK + + 19:37:25.430000 A > B: 34155521:6272002 win 4096 datasz 0 ACK + 19:37:25.450000 B > A: 6272002:34155522 win 4096 datasz 0 ACK + + + + +Paxson, et. al. 
Informational [Page 35] + +RFC 2525 TCP Implementation Problems March 1999 + + + 21:37:30.560000 A > B: 34155521:6272002 win 4096 datasz 0 ACK + 21:37:30.570000 B > A: 6272002:34155522 win 4096 datasz 0 ACK + + 23:37:35.580000 A > B: 34155521:6272002 win 4096 datasz 0 ACK + 23:37:35.600000 B > A: 6272002:34155522 win 4096 datasz 0 ACK + + 01:37:40.620000 A > B: 34155521:6272002 win 4096 datasz 0 ACK + 01:37:40.640000 B > A: 6272002:34155522 win 4096 datasz 0 ACK + + 03:37:45.590000 A > B: 34155521:6272002 win 4096 datasz 0 ACK + 03:37:45.610000 B > A: 6272002:34155522 win 4096 datasz 0 ACK + + The initial three packets are the SYN exchange for connection + setup. Just over two hours later, the keepalive timer fires + because the connection is idle. The keepalive is acknowledged, + and the timer fires again just over two hours later. This + behavior continues indefinitely until the connection is closed. + + References + This problem is documented in [Dawson97]. + + How to detect + For implementations manifesting this problem, it shows up on a + packet trace. If the connection is left idle, the keepalive + probes will arrive closer together than the two hour minimum. + +2.12. + + Name of Problem + Window probe deadlock + + Classification + Reliability + + Description + When an application reads a single byte from a full window, the + window should not be updated, in order to avoid Silly Window + Syndrome (SWS; see [RFC813]). If the remote peer uses a single + byte of data to probe the window, that byte can be accepted into + the buffer. In some implementations, at this point a negative + argument to a signed comparison causes all further new data to be + considered outside the window; consequently, it is discarded + (after sending an ACK to resynchronize). These discards include + the ACKs for the data packets sent by the local TCP, so the TCP + will consider the data unacknowledged. + + + + + + +Paxson, et. al. 
Informational [Page 36] + +RFC 2525 TCP Implementation Problems March 1999 + + + Consequently, the application may be unable to complete sending + new data to the remote peer, because it has exhausted the transmit + buffer available to its local TCP, and buffer space is never being + freed because incoming ACKs that would do so are being discarded. + If the application does not read any more data, which may happen + due to its failure to complete such sends, then deadlock results. + + Significance + It's relatively rare for applications to use TCP in a manner that + can exercise this problem. Most applications only transmit bulk + data if they know the other end is prepared to receive the data. + However, if a client fails to consume data, putting the server in + persist mode, and then consumes a small amount of data, it can + mistakenly compute a negative window. At this point the client + will discard all further packets from the server, including ACKs + of the client's own data, since they are not inside the + (impossibly-sized) window. If subsequently the client consumes + enough data to then send a window update to the server, the + situation will be rectified. That is, this situation can only + happen if the client consumes 1 < N < MSS bytes, so as not to + cause a window update, and then starts its own transmission + towards the server of more than a window's worth of data. + + Implications + TCP connections will hang and eventually time out. + + Relevant RFCs + RFC 793 describes zero window probing. RFC 813 describes Silly + Window Syndrome. + + Trace file demonstrating it + Trace made from a version of tcpdump modified to print out the + sequence number attached to an ACK even if it's dataless. An + unmodified tcpdump would not print seq:seq(0); however, for this + bug, the sequence number in the ACK is important for unambiguously + determining how the TCP is behaving. + + [ Normal connection startup and data transmission from B to A. 
+ Options, including MSS of 16344 in both directions, omitted + for clarity. ] + 16:07:32.327616 A > B: S 65360807:65360807(0) win 8192 + 16:07:32.327304 B > A: S 65488807:65488807(0) ack 65360808 win 57344 + 16:07:32.327425 A > B: . 1:1(0) ack 1 win 57344 + 16:07:32.345732 B > A: P 1:2049(2048) ack 1 win 57344 + 16:07:32.347013 B > A: P 2049:16385(14336) ack 1 win 57344 + 16:07:32.347550 B > A: P 16385:30721(14336) ack 1 win 57344 + 16:07:32.348683 B > A: P 30721:45057(14336) ack 1 win 57344 + 16:07:32.467286 A > B: . 1:1(0) ack 45057 win 12288 + + + +Paxson, et. al. Informational [Page 37] + +RFC 2525 TCP Implementation Problems March 1999 + + + 16:07:32.467854 B > A: P 45057:57345(12288) ack 1 win 57344 + + [ B fills up A's offered window ] + 16:07:32.667276 A > B: . 1:1(0) ack 57345 win 0 + + [ B probes A's window with a single byte ] + 16:07:37.467438 B > A: . 57345:57346(1) ack 1 win 57344 + + [ A resynchronizes without accepting the byte ] + 16:07:37.467678 A > B: . 1:1(0) ack 57345 win 0 + + [ B probes A's window again ] + 16:07:45.467438 B > A: . 57345:57346(1) ack 1 win 57344 + + [ A resynchronizes and accepts the byte (per the ack field) ] + 16:07:45.667250 A > B: . 1:1(0) ack 57346 win 0 + + [ The application on A has started generating data. The first + packet A sends is small due to a memory allocation bug. ] + 16:07:51.358459 A > B: P 1:2049(2048) ack 57346 win 0 + + [ B acks A's first packet ] + 16:07:51.467239 B > A: . 57346:57346(0) ack 2049 win 57344 + + [ This looks as though A accepted B's ACK and is sending + another packet in response to it. In fact, A is trying + to resynchronize with B, and happens to have data to send + and can send it because the first small packet didn't use + up cwnd. ] + 16:07:51.467698 A > B: . 2049:14337(12288) ack 57346 win 0 + + [ B acks all of the data that A has sent ] + 16:07:51.667283 B > A: . 57346:57346(0) ack 14337 win 57344 + + [ A tries to resynchronize. 
Notice that by the packets + seen on the network, A and B *are* in fact synchronized; + A only thinks that they aren't. ] + 16:07:51.667477 A > B: . 14337:14337(0) ack 57346 win 0 + + [ A's retransmit timer fires, and B acks all of the data. + A once again tries to resynchronize. ] + 16:07:52.467682 A > B: . 1:14337(14336) ack 57346 win 0 + 16:07:52.468166 B > A: . 57346:57346(0) ack 14337 win 57344 + 16:07:52.468248 A > B: . 14337:14337(0) ack 57346 win 0 + + [ A's retransmit timer fires again, and B acks all of the data. + A once again tries to resynchronize. ] + 16:07:55.467684 A > B: . 1:14337(14336) ack 57346 win 0 + + + +Paxson, et. al. Informational [Page 38] + +RFC 2525 TCP Implementation Problems March 1999 + + + 16:07:55.468172 B > A: . 57346:57346(0) ack 14337 win 57344 + 16:07:55.468254 A > B: . 14337:14337(0) ack 57346 win 0 + + Trace file demonstrating correct behavior + Made between the same two hosts after applying the bug fix + mentioned below (and using the same modified tcpdump). + + [ Connection starts up with data transmission from B to A. + Note that due to a separate bug (the fact that A and B + are communicating over a loopback driver), B erroneously + skips slow start. ] + 17:38:09.510854 A > B: S 3110066585:3110066585(0) win 16384 + 17:38:09.510926 B > A: S 3110174850:3110174850(0) + ack 3110066586 win 57344 + 17:38:09.510953 A > B: . 1:1(0) ack 1 win 57344 + 17:38:09.512956 B > A: P 1:2049(2048) ack 1 win 57344 + 17:38:09.513222 B > A: P 2049:16385(14336) ack 1 win 57344 + 17:38:09.513428 B > A: P 16385:30721(14336) ack 1 win 57344 + 17:38:09.513638 B > A: P 30721:45057(14336) ack 1 win 57344 + 17:38:09.519531 A > B: . 1:1(0) ack 45057 win 12288 + 17:38:09.519638 B > A: P 45057:57345(12288) ack 1 win 57344 + + [ B fills up A's offered window ] + 17:38:09.719526 A > B: . 1:1(0) ack 57345 win 0 + + [ B probes A's window with a single byte. A resynchronizes + without accepting the byte ] + 17:38:14.499661 B > A: . 
57345:57346(1) ack 1 win 57344 + 17:38:14.499724 A > B: . 1:1(0) ack 57345 win 0 + + [ B probes A's window again. A resynchronizes and accepts + the byte, as indicated by the ack field ] + 17:38:19.499764 B > A: . 57345:57346(1) ack 1 win 57344 + 17:38:19.519731 A > B: . 1:1(0) ack 57346 win 0 + + [ B probes A's window with a single byte. A resynchronizes + without accepting the byte ] + 17:38:24.499865 B > A: . 57346:57347(1) ack 1 win 57344 + 17:38:24.499934 A > B: . 1:1(0) ack 57346 win 0 + + [ The application on A has started generating data. + B acks A's data and A accepts the ACKs and the + data transfer continues ] + 17:38:28.530265 A > B: P 1:2049(2048) ack 57346 win 0 + 17:38:28.719914 B > A: . 57346:57346(0) ack 2049 win 57344 + + 17:38:28.720023 A > B: . 2049:16385(14336) ack 57346 win 0 + 17:38:28.720089 A > B: . 16385:30721(14336) ack 57346 win 0 + + + +Paxson, et. al. Informational [Page 39] + +RFC 2525 TCP Implementation Problems March 1999 + + + 17:38:28.720370 B > A: . 57346:57346(0) ack 30721 win 57344 + + 17:38:28.720462 A > B: . 30721:45057(14336) ack 57346 win 0 + 17:38:28.720526 A > B: P 45057:59393(14336) ack 57346 win 0 + 17:38:28.720824 A > B: P 59393:73729(14336) ack 57346 win 0 + 17:38:28.721124 B > A: . 57346:57346(0) ack 73729 win 47104 + + 17:38:28.721198 A > B: P 73729:88065(14336) ack 57346 win 0 + 17:38:28.721379 A > B: P 88065:102401(14336) ack 57346 win 0 + + 17:38:28.721557 A > B: P 102401:116737(14336) ack 57346 win 0 + 17:38:28.721863 B > A: . 57346:57346(0) ack 116737 win 36864 + + References + None known. + + How to detect + Initiate a connection from a client to a server. Have the server + continuously send data until its buffers have been full for long + enough to exhaust the window. Next, have the client read 1 byte + and then delay for long enough that the server TCP sends a window + probe. Now have the client start sending data. 
At this point, if + it ignores the server's ACKs, then the client's TCP suffers from + the problem. + + How to fix + In one implementation known to exhibit the problem (derived from + 4.3-Reno), the problem was introduced when the macro MAX() was + replaced by the function call max() for computing the amount of + space in the receive window: + + tp->rcv_wnd = max(win, (int)(tp->rcv_adv - tp->rcv_nxt)); + + When data has been received into a window beyond what has been + advertised to the other side, rcv_nxt > rcv_adv, making this + negative. It's clear from the (int) cast that this is intended, + but the unsigned max() function sign-extends so the negative + number is "larger". The fix is to change max() to imax(): + + tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); + + 4.3-Tahoe and before did not have this bug, since it used the + macro MAX() for this calculation. + +2.13. + + Name of Problem + Stretch ACK violation + + + +Paxson, et. al. Informational [Page 40] + +RFC 2525 TCP Implementation Problems March 1999 + + + Classification + Congestion Control/Performance + + Description + To improve efficiency (both computer and network) a data receiver + may refrain from sending an ACK for each incoming segment, + according to [RFC1122]. However, an ACK should not be delayed an + inordinate amount of time. Specifically, ACKs SHOULD be sent for + every second full-sized segment that arrives. If a second full- + sized segment does not arrive within a given timeout (of no more + than 0.5 seconds), an ACK should be transmitted, according to + [RFC1122]. A TCP receiver which does not generate an ACK for + every second full-sized segment exhibits a "Stretch ACK + Violation". + + Significance + TCP receivers exhibiting this behavior will cause TCP senders to + generate burstier traffic, which can degrade performance in + congested environments. 
In addition, generating fewer ACKs + increases the amount of time needed by the slow start algorithm to + open the congestion window to an appropriate point, which + diminishes performance in environments with large bandwidth-delay + products. Finally, generating fewer ACKs may cause needless + retransmission timeouts in lossy environments, as it increases the + possibility that an entire window of ACKs is lost, forcing a + retransmission timeout. + + Implications + When not in loss recovery, every ACK received by a TCP sender + triggers the transmission of new data segments. The burst size is + determined by the number of previously unacknowledged segments + each ACK covers. Therefore, a TCP receiver ack'ing more than 2 + segments at a time causes the sending TCP to generate a larger + burst of traffic upon receipt of the ACK. This large burst of + traffic can overwhelm an intervening gateway, leading to higher + drop rates for both the connection and other connections passing + through the congested gateway. + + In addition, the TCP slow start algorithm increases the congestion + window by 1 segment for each ACK received. Therefore, increasing + the ACK interval (thus decreasing the rate at which ACKs are + transmitted) increases the amount of time it takes slow start to + increase the congestion window to an appropriate operating point, + and the connection consequently suffers from reduced performance. + This is especially true for connections using large windows. + + Relevant RFCs + RFC 1122 outlines delayed ACKs as a recommended mechanism. + + + +Paxson, et. al. Informational [Page 41] + +RFC 2525 TCP Implementation Problems March 1999 + + + Trace file demonstrating it + Trace file taken using tcpdump at host B, the data receiver (and + ACK originator). The advertised window (which never changed) and + timestamp options have been omitted for clarity, except for the + first packet sent by A: + + 12:09:24.820187 A.1174 > B.3999: . 
2049:3497(1448) ack 1 + win 33580 [tos 0x8] + 12:09:24.824147 A.1174 > B.3999: . 3497:4945(1448) ack 1 + 12:09:24.832034 A.1174 > B.3999: . 4945:6393(1448) ack 1 + 12:09:24.832222 B.3999 > A.1174: . ack 6393 + 12:09:24.934837 A.1174 > B.3999: . 6393:7841(1448) ack 1 + 12:09:24.942721 A.1174 > B.3999: . 7841:9289(1448) ack 1 + 12:09:24.950605 A.1174 > B.3999: . 9289:10737(1448) ack 1 + 12:09:24.950797 B.3999 > A.1174: . ack 10737 + 12:09:24.958488 A.1174 > B.3999: . 10737:12185(1448) ack 1 + 12:09:25.052330 A.1174 > B.3999: . 12185:13633(1448) ack 1 + 12:09:25.060216 A.1174 > B.3999: . 13633:15081(1448) ack 1 + 12:09:25.060405 B.3999 > A.1174: . ack 15081 + + This portion of the trace clearly shows that the receiver (host B) + sends an ACK for every third full sized packet received. Further + investigation of this implementation found that the cause of the + increased ACK interval was the TCP options being used. The + implementation sent an ACK after it was holding 2*MSS worth of + unacknowledged data. In the above case, the MSS is 1460 bytes so + the receiver transmits an ACK after it is holding at least 2920 + bytes of unacknowledged data. However, the length of the TCP + options being used [RFC1323] took 12 bytes away from the data + portion of each packet. This produced packets containing 1448 + bytes of data. But the additional bytes used by the options in + the header were not taken into account when determining when to + trigger an ACK. Therefore, it took 3 data segments before the + data receiver was holding enough unacknowledged data (>= 2*MSS, or + 2920 bytes in the above example) to transmit an ACK. + + Trace file demonstrating correct behavior + Trace file taken using tcpdump at host B, the data receiver (and + ACK originator), again with window and timestamp information + omitted except for the first packet: + + 12:06:53.627320 A.1172 > B.3999: . 1449:2897(1448) ack 1 + win 33580 [tos 0x8] + 12:06:53.634773 A.1172 > B.3999: . 
2897:4345(1448) ack 1 + 12:06:53.634961 B.3999 > A.1172: . ack 4345 + 12:06:53.737326 A.1172 > B.3999: . 4345:5793(1448) ack 1 + 12:06:53.744401 A.1172 > B.3999: . 5793:7241(1448) ack 1 + 12:06:53.744592 B.3999 > A.1172: . ack 7241 + + + +Paxson, et. al. Informational [Page 42] + +RFC 2525 TCP Implementation Problems March 1999 + + + 12:06:53.752287 A.1172 > B.3999: . 7241:8689(1448) ack 1 + 12:06:53.847332 A.1172 > B.3999: . 8689:10137(1448) ack 1 + 12:06:53.847525 B.3999 > A.1172: . ack 10137 + + This trace shows the TCP receiver (host B) ack'ing every second + full-sized packet, according to [RFC1122]. This is the same + implementation shown above, with slight modifications that allow + the receiver to take the length of the options into account when + deciding when to transmit an ACK. + + References + This problem is documented in [Allman97] and [Paxson97]. + + How to detect + Stretch ACK violations show up immediately in receiver-side packet + traces of bulk transfers, as shown above. However, packet traces + made on the sender side of the TCP connection may lead to + ambiguities when diagnosing this problem due to the possibility of + lost ACKs. + +2.14. + + Name of Problem + Retransmission sends multiple packets + + Classification + Congestion control + + Description + When a TCP retransmits a segment due to a timeout expiration or + beginning a fast retransmission sequence, it should only transmit + a single segment. A TCP that transmits more than one segment + exhibits "Retransmission Sends Multiple Packets". + + Instances of this problem have been known to occur due to + miscomputations involving the use of TCP options. TCP options + increase the TCP header beyond its usual size of 20 bytes. The + total size of header must be taken into account when + retransmitting a packet. If a TCP sender does not account for the + length of the TCP options when determining how much data to + retransmit, it will send too much data to fit into a single + packet. 
In this case, the correct retransmission will be followed + by a short segment (tinygram) containing data that may not need to + be retransmitted. + + A specific case is a TCP using the RFC 1323 timestamp option, + which adds 12 bytes to the standard 20-byte TCP header. On + retransmission of a packet, the 12 byte option is incorrectly + + + +Paxson, et. al. Informational [Page 43] + +RFC 2525 TCP Implementation Problems March 1999 + + + interpreted as part of the data portion of the segment. A + standard TCP header and a new 12-byte option is added to the data, + which yields a transmission of 12 bytes more data than contained + in the original segment. This overflow causes a smaller packet, + with 12 data bytes, to be transmitted. + + Significance + This problem is somewhat serious for congested environments + because the TCP implementation injects more packets into the + network than is appropriate. However, since a tinygram is only + sent in response to a fast retransmit or a timeout, it does not + affect the sustained sending rate. + + Implications + A TCP exhibiting this behavior is stressing the network with more + traffic than appropriate, and stressing routers by increasing the + number of packets they must process. The redundant tinygram will + also elicit a duplicate ACK from the receiver, resulting in yet + another unnecessary transmission. + + Relevant RFCs + RFC 1122 requires use of slow start after loss; RFC 2001 + explicates slow start; RFC 1323 describes the timestamp option + that has been observed to lead to some implementations exhibiting + this problem. + + Trace file demonstrating it + Made using tcpdump recording at a machine on the same subnet as + Host A. Host A is the sender and Host B is the receiver. The + advertised window and timestamp options have been omitted for + clarity, except for the first segment sent by host A. 
In + addition, portions of the trace file not pertaining to the packet + in question have been removed (missing packets are denoted by + "[...]" in the trace). + + 11:55:22.701668 A > B: . 7361:7821(460) ack 1 + win 49324 + 11:55:22.702109 A > B: . 7821:8281(460) ack 1 + [...] + + 11:55:23.112405 B > A: . ack 7821 + 11:55:23.113069 A > B: . 12421:12881(460) ack 1 + 11:55:23.113511 A > B: . 12881:13341(460) ack 1 + 11:55:23.333077 B > A: . ack 7821 + 11:55:23.336860 B > A: . ack 7821 + 11:55:23.340638 B > A: . ack 7821 + 11:55:23.341290 A > B: . 7821:8281(460) ack 1 + 11:55:23.341317 A > B: . 8281:8293(12) ack 1 + + + +Paxson, et. al. Informational [Page 44] + +RFC 2525 TCP Implementation Problems March 1999 + + + 11:55:23.498242 B > A: . ack 7821 + 11:55:23.506850 B > A: . ack 7821 + 11:55:23.510630 B > A: . ack 7821 + + [...] + + 11:55:23.746649 B > A: . ack 10581 + + The second line of the above trace shows the original transmission + of a segment which is later dropped. After 3 duplicate ACKs, line + 9 of the trace shows the dropped packet (7821:8281), with a 460- + byte payload, being retransmitted. Immediately following this + retransmission, a packet with a 12-byte payload is unnecessarily + sent. + + Trace file demonstrating correct behavior + The trace file would be identical to the one above, with a single + line: + + 11:55:23.341317 A > B: . 8281:8293(12) ack 1 + + omitted. + + References + [Brakmo95] + + How to detect + This problem can be detected by examining a packet trace of the + TCP connections of a machine using TCP options, during which a + packet is retransmitted. + +2.15. + + Name of Problem + Failure to send FIN notification promptly + + Classification + Performance + + Description + When an application closes a connection, the corresponding TCP + should send the FIN notification promptly to its peer (unless + prevented by the congestion window). 
If a TCP implementation + delays in sending the FIN notification, for example due to waiting + until unacknowledged data has been acknowledged, then it is said + to exhibit "Failure to send FIN notification promptly". + + + + + +Paxson, et. al. Informational [Page 45] + +RFC 2525 TCP Implementation Problems March 1999 + + + Also, while not strictly required, FIN segments should include the + PSH flag to ensure expedited delivery of any pending data at the + receiver. + + Significance + The greatest impact occurs for short-lived connections, since for + these the additional time required to close the connection + introduces the greatest relative delay. + + The additional time can be significant in the common case of the + sender waiting for an ACK that is delayed by the receiver. + + Implications + Can diminish total throughput as seen at the application layer, + because connection termination takes longer to complete. + + Relevant RFCs + RFC 793 indicates that a receiver should treat an incoming FIN + flag as implying the push function. + + Trace file demonstrating it + Made using tcpdump (no losses reported by the packet filter). + + 10:04:38.68 A > B: S 1031850376:1031850376(0) win 4096 + (DF) + 10:04:38.71 B > A: S 596916473:596916473(0) ack 1031850377 + win 8760 (DF) + 10:04:38.73 A > B: . ack 1 win 4096 (DF) + 10:04:41.98 A > B: P 1:4(3) ack 1 win 4096 (DF) + 10:04:42.15 B > A: . ack 4 win 8757 (DF) + 10:04:42.23 A > B: P 4:7(3) ack 1 win 4096 (DF) + 10:04:42.25 B > A: P 1:11(10) ack 7 win 8754 (DF) + 10:04:42.32 A > B: . ack 11 win 4096 (DF) + 10:04:42.33 B > A: P 11:51(40) ack 7 win 8754 (DF) + 10:04:42.51 A > B: . ack 51 win 4096 (DF) + 10:04:42.53 B > A: F 51:51(0) ack 7 win 8754 (DF) + 10:04:42.56 A > B: FP 7:7(0) ack 52 win 4096 (DF) + 10:04:42.58 B > A: . ack 8 win 8754 (DF) + + Machine B in the trace above does not send out a FIN notification + promptly if there is any data outstanding. 
It instead waits for + all unacknowledged data to be acknowledged before sending the FIN + segment. The connection was closed at 10:04.42.33 after + requesting 40 bytes to be sent. However, the FIN notification + isn't sent until 10:04.42.51, after the (delayed) acknowledgement + of the 40 bytes of data. + + + + + +Paxson, et. al. Informational [Page 46] + +RFC 2525 TCP Implementation Problems March 1999 + + + Trace file demonstrating correct behavior + Made using tcpdump (no losses reported by the packet filter). + + 10:27:53.85 C > D: S 419744533:419744533(0) win 4096 + (DF) + 10:27:53.92 D > C: S 10082297:10082297(0) ack 419744534 + win 8760 (DF) + 10:27:53.95 C > D: . ack 1 win 4096 (DF) + 10:27:54.42 C > D: P 1:4(3) ack 1 win 4096 (DF) + 10:27:54.62 D > C: . ack 4 win 8757 (DF) + 10:27:54.76 C > D: P 4:7(3) ack 1 win 4096 (DF) + 10:27:54.89 D > C: P 1:11(10) ack 7 win 8754 (DF) + 10:27:54.90 D > C: FP 11:51(40) ack7 win 8754 (DF) + 10:27:54.92 C > D: . ack 52 win 4096 (DF) + 10:27:55.01 C > D: FP 7:7(0) ack 52 win 4096 (DF) + 10:27:55.09 D > C: . ack 8 win 8754 (DF) + + Here, Machine D sends a FIN with 40 bytes of data even before the + original 10 octets have been acknowledged. This is correct + behavior as it provides for the highest performance. + + References + This problem is documented in [Dawson97]. + + How to detect + For implementations manifesting this problem, it shows up on a + packet trace. + +2.16. + + Name of Problem + Failure to send a RST after Half Duplex Close + + Classification + Resource management + + Description + RFC 1122 4.2.2.13 states that a TCP SHOULD send a RST if data is + received after "half duplex close", i.e. if it cannot be delivered + to the application. A TCP that fails to do so is said to exhibit + "Failure to send a RST after Half Duplex Close". 
+ + Significance + Potentially serious for TCP endpoints that manage large numbers of + connections, due to exhaustion of memory and/or process slots + available for managing connection state. + + + + + +Paxson, et. al. Informational [Page 47] + +RFC 2525 TCP Implementation Problems March 1999 + + + Implications + Failure to send the RST can lead to permanently hung TCP + connections. This problem has been demonstrated when HTTP clients + abort connections, common when users move on to a new page before + the current page has finished downloading. The HTTP client closes + by transmitting a FIN while the server is transmitting images, + text, etc. The server TCP receives the FIN, but its application + does not close the connection until all data has been queued for + transmission. Since the server will not transmit a FIN until all + the preceding data has been transmitted, deadlock results if the + client TCP does not consume the pending data or tear down the + connection: the window decreases to zero, since the client cannot + pass the data to the application, and the server sends probe + segments. The client acknowledges the probe segments with a zero + window. As mandated in RFC1122 4.2.2.17, the probe segments are + transmitted forever. Server connection state remains in + CLOSE_WAIT, and eventually server processes are exhausted. + + Note that there are two bugs. First, probe segments should be + ignored if the window can never subsequently increase. Second, a + RST should be sent when data is received after half duplex close. + Fixing the first bug, but not the second, results in the probe + segments eventually timing out the connection, but the server + remains in CLOSE_WAIT for a significant and unnecessary period. + + Relevant RFCs + RFC 1122 sections 4.2.2.13 and 4.2.2.17. + + Trace file demonstrating it + Made using an unknown network analyzer. No drop information + available. 
+ + client.1391 > server.8080: S 0:1(0) ack: 0 win: 2000 + server.8080 > client.1391: SA 8c01:8c02(0) ack: 1 win: 8000 + client.1391 > server.8080: PA + client.1391 > server.8080: PA 1:1c2(1c1) ack: 8c02 win: 2000 + server.8080 > client.1391: [DF] PA 8c02:8cde(dc) ack: 1c2 win: 8000 + server.8080 > client.1391: [DF] A 8cde:9292(5b4) ack: 1c2 win: 8000 + server.8080 > client.1391: [DF] A 9292:9846(5b4) ack: 1c2 win: 8000 + server.8080 > client.1391: [DF] A 9846:9dfa(5b4) ack: 1c2 win: 8000 + client.1391 > server.8080: PA + server.8080 > client.1391: [DF] A 9dfa:a3ae(5b4) ack: 1c2 win: 8000 + server.8080 > client.1391: [DF] A a3ae:a962(5b4) ack: 1c2 win: 8000 + server.8080 > client.1391: [DF] A a962:af16(5b4) ack: 1c2 win: 8000 + server.8080 > client.1391: [DF] A af16:b4ca(5b4) ack: 1c2 win: 8000 + client.1391 > server.8080: PA + server.8080 > client.1391: [DF] A b4ca:ba7e(5b4) ack: 1c2 win: 8000 + server.8080 > client.1391: [DF] A b4ca:ba7e(5b4) ack: 1c2 win: 8000 + + + +Paxson, et. al. Informational [Page 48] + +RFC 2525 TCP Implementation Problems March 1999 + + + client.1391 > server.8080: PA + server.8080 > client.1391: [DF] A ba7e:bdfa(37c) ack: 1c2 win: 8000 + client.1391 > server.8080: PA + server.8080 > client.1391: [DF] A bdfa:bdfb(1) ack: 1c2 win: 8000 + client.1391 > server.8080: PA + + [ HTTP client aborts and enters FIN_WAIT_1 ] + + client.1391 > server.8080: FPA + + [ server ACKs the FIN and enters CLOSE_WAIT ] + + server.8080 > client.1391: [DF] A + + [ client enters FIN_WAIT_2 ] + + server.8080 > client.1391: [DF] A bdfa:bdfb(1) ack: 1c3 win: 8000 + + [ server continues to try to send its data ] + + client.1391 > server.8080: PA < window = 0 > + server.8080 > client.1391: [DF] A bdfa:bdfb(1) ack: 1c3 win: 8000 + client.1391 > server.8080: PA < window = 0 > + server.8080 > client.1391: [DF] A bdfa:bdfb(1) ack: 1c3 win: 8000 + client.1391 > server.8080: PA < window = 0 > + server.8080 > client.1391: [DF] A bdfa:bdfb(1) ack: 1c3 win: 8000 + client.1391 
> server.8080: PA < window = 0 > + server.8080 > client.1391: [DF] A bdfa:bdfb(1) ack: 1c3 win: 8000 + client.1391 > server.8080: PA < window = 0 > + + [ ... repeat ad exhaustium ... ] + + Trace file demonstrating correct behavior + Made using an unknown network analyzer. No drop information + available. + + client > server D=80 S=59500 Syn Seq=337 Len=0 Win=8760 + server > client D=59500 S=80 Syn Ack=338 Seq=80153 Len=0 Win=8760 + client > server D=80 S=59500 Ack=80154 Seq=338 Len=0 Win=8760 + + [ ... normal data omitted ... ] + + client > server D=80 S=59500 Ack=14559 Seq=596 Len=0 Win=8760 + server > client D=59500 S=80 Ack=596 Seq=114559 Len=1460 Win=8760 + + [ client closes connection ] + + client > server D=80 S=59500 Fin Seq=596 Len=0 Win=8760 + + + +Paxson, et. al. Informational [Page 49] + +RFC 2525 TCP Implementation Problems March 1999 + + + server > client D=59500 S=80 Ack=597 Seq=116019 Len=1460 Win=8760 + + [ client sends RST (RFC1122 4.2.2.13) ] + + client > server D=80 S=59500 Rst Seq=597 Len=0 Win=0 + server > client D=59500 S=80 Ack=597 Seq=117479 Len=1460 Win=8760 + client > server D=80 S=59500 Rst Seq=597 Len=0 Win=0 + server > client D=59500 S=80 Ack=597 Seq=118939 Len=1460 Win=8760 + client > server D=80 S=59500 Rst Seq=597 Len=0 Win=0 + server > client D=59500 S=80 Ack=597 Seq=120399 Len=892 Win=8760 + client > server D=80 S=59500 Rst Seq=597 Len=0 Win=0 + server > client D=59500 S=80 Ack=597 Seq=121291 Len=1460 Win=8760 + client > server D=80 S=59500 Rst Seq=597 Len=0 Win=0 + + "client" sends a number of RSTs, one in response to each incoming + packet from "server". One might wonder why "server" keeps sending + data packets after it has received a RST from "client"; the + explanation is that "server" had already transmitted all five of + the data packets before receiving the first RST from "client", so + it is too late to avoid transmitting them. 
+ + How to detect + The problem can be detected by inspecting packet traces of a + large, interrupted bulk transfer. + +2.17. + + Name of Problem + Failure to RST on close with data pending + + Classification + Resource management + + Description + When an application closes a connection in such a way that it can + no longer read any received data, the TCP SHOULD, per section + 4.2.2.13 of RFC 1122, send a RST if there is any unread received + data, or if any new data is received. A TCP that fails to do so + exhibits "Failure to RST on close with data pending". + + Note that, for some TCPs, this situation can be caused by an + application "crashing" while a peer is sending data. + + We have observed a number of TCPs that exhibit this problem. The + problem is less serious if any subsequent data sent to the now- + closed connection endpoint elicits a RST (see illustration below). + + + + + +Paxson, et. al. Informational [Page 50] + +RFC 2525 TCP Implementation Problems March 1999 + + + Significance + This problem is most significant for endpoints that engage in + large numbers of connections, as their ability to do so will be + curtailed as they leak away resources. + + Implications + Failure to reset the connection can lead to permanently hung + connections, in which the remote endpoint takes no further action + to tear down the connection because it is waiting on the local TCP + to first take some action. This is particularly the case if the + local TCP also allows the advertised window to go to zero, and + fails to tear down the connection when the remote TCP engages in + "persist" probes (see example below). + + Relevant RFCs + RFC 1122 section 4.2.2.13. Also, 4.2.2.17 for the zero-window + probing discussion below. + + Trace file demonstrating it + Made using tcpdump. No drop information available. + + 13:11:46.04 A > B: S 458659166:458659166(0) win 4096 + (DF) + 13:11:46.04 B > A: S 792320000:792320000(0) ack 458659167 + win 4096 + 13:11:46.04 A > B: . 
ack 1 win 4096 (DF) + 13:11.55.80 A > B: . 1:513(512) ack 1 win 4096 (DF) + 13:11.55.80 A > B: . 513:1025(512) ack 1 win 4096 (DF) + 13:11:55.83 B > A: . ack 1025 win 3072 + 13:11.55.84 A > B: . 1025:1537(512) ack 1 win 4096 (DF) + 13:11.55.84 A > B: . 1537:2049(512) ack 1 win 4096 (DF) + 13:11.55.85 A > B: . 2049:2561(512) ack 1 win 4096 (DF) + 13:11:56.03 B > A: . ack 2561 win 1536 + 13:11.56.05 A > B: . 2561:3073(512) ack 1 win 4096 (DF) + 13:11.56.06 A > B: . 3073:3585(512) ack 1 win 4096 (DF) + 13:11.56.06 A > B: . 3585:4097(512) ack 1 win 4096 (DF) + 13:11:56.23 B > A: . ack 4097 win 0 + 13:11:58.16 A > B: . 4096:4097(1) ack 1 win 4096 (DF) + 13:11:58.16 B > A: . ack 4097 win 0 + 13:12:00.16 A > B: . 4096:4097(1) ack 1 win 4096 (DF) + 13:12:00.16 B > A: . ack 4097 win 0 + 13:12:02.16 A > B: . 4096:4097(1) ack 1 win 4096 (DF) + 13:12:02.16 B > A: . ack 4097 win 0 + 13:12:05.37 A > B: . 4096:4097(1) ack 1 win 4096 (DF) + 13:12:05.37 B > A: . ack 4097 win 0 + 13:12:06.36 B > A: F 1:1(0) ack 4097 win 0 + 13:12:06.37 A > B: . ack 2 win 4096 (DF) + 13:12:11.78 A > B: . 4096:4097(1) ack 2 win 4096 (DF) + + + +Paxson, et. al. Informational [Page 51] + +RFC 2525 TCP Implementation Problems March 1999 + + + 13:12:11.78 B > A: . ack 4097 win 0 + 13:12:24.59 A > B: . 4096:4097(1) ack 2 win 4096 (DF) + 13:12:24.60 B > A: . ack 4097 win 0 + 13:12:50.22 A > B: . 4096:4097(1) ack 2 win 4096 (DF) + 13:12:50.22 B > A: . ack 4097 win 0 + + Machine B in the trace above does not drop received data when the + socket is "closed" by the application (in this case, the + application process was terminated). This occurred at + approximately 13:12:06.36 and resulted in the FIN being sent in + response to the close. However, because there is no longer an + application to deliver the data to, the TCP should have instead + sent a RST. + + Note: Machine A's zero-window probing is also broken. It is + resending old data, rather than new data. 
Section 3.7 in RFC 793 + and Section 4.2.2.17 in RFC 1122 discuss zero-window probing. + + Trace file demonstrating better behavior + Made using tcpdump. No drop information available. + + Better, but still not fully correct, behavior, per the discussion + below. We show this behavior because it has been observed for a + number of different TCP implementations. + + 13:48:29.24 C > D: S 73445554:73445554(0) win 4096 + (DF) + 13:48:29.24 D > C: S 36050296:36050296(0) ack 73445555 + win 4096 (DF) + 13:48:29.25 C > D: . ack 1 win 4096 (DF) + 13:48:30.78 C > D: . 1:1461(1460) ack 1 win 4096 (DF) + 13:48:30.79 C > D: . 1461:2921(1460) ack 1 win 4096 (DF) + 13:48:30.80 D > C: . ack 2921 win 1176 (DF) + 13:48:32.75 C > D: . 2921:4097(1176) ack 1 win 4096 (DF) + 13:48:32.82 D > C: . ack 4097 win 0 (DF) + 13:48:34.76 C > D: . 4096:4097(1) ack 1 win 4096 (DF) + 13:48:34.84 D > C: . ack 4097 win 0 (DF) + 13:48:36.34 D > C: FP 1:1(0) ack 4097 win 4096 (DF) + 13:48:36.34 C > D: . 4097:5557(1460) ack 2 win 4096 (DF) + 13:48:36.34 D > C: R 36050298:36050298(0) win 24576 + 13:48:36.34 C > D: . 5557:7017(1460) ack 2 win 4096 (DF) + 13:48:36.34 D > C: R 36050298:36050298(0) win 24576 + + In this trace, the application process is terminated on Machine D + at approximately 13:48:36.34. Its TCP sends the FIN with the + window opened again (since it discarded the previously received + data). Machine C promptly sends more data, causing Machine D to + + + + +Paxson, et. al. Informational [Page 52] + +RFC 2525 TCP Implementation Problems March 1999 + + + reset the connection since it cannot deliver the data to the + application. Ideally, Machine D SHOULD send a RST instead of + dropping the data and re-opening the receive window. + + Note: Machine C's zero-window probing is broken, the same as in + the example above. + + Trace file demonstrating correct behavior + Made using tcpdump. No losses reported by the packet filter. 
+ + 14:12:02.19 E > F: S 1143360000:1143360000(0) win 4096 + 14:12:02.19 F > E: S 1002988443:1002988443(0) ack 1143360001 + win 4096 (DF) + 14:12:02.19 E > F: . ack 1 win 4096 + 14:12:10.43 E > F: . 1:513(512) ack 1 win 4096 + 14:12:10.61 F > E: . ack 513 win 3584 (DF) + 14:12:10.61 E > F: . 513:1025(512) ack 1 win 4096 + 14:12:10.61 E > F: . 1025:1537(512) ack 1 win 4096 + 14:12:10.81 F > E: . ack 1537 win 2560 (DF) + 14:12:10.81 E > F: . 1537:2049(512) ack 1 win 4096 + 14:12:10.81 E > F: . 2049:2561(512) ack 1 win 4096 + 14:12:10.81 E > F: . 2561:3073(512) ack 1 win 4096 + 14:12:11.01 F > E: . ack 3073 win 1024 (DF) + 14:12:11.01 E > F: . 3073:3585(512) ack 1 win 4096 + 14:12:11.01 E > F: . 3585:4097(512) ack 1 win 4096 + 14:12:11.21 F > E: . ack 4097 win 0 (DF) + 14:12:15.88 E > F: . 4097:4098(1) ack 1 win 4096 + 14:12:16.06 F > E: . ack 4097 win 0 (DF) + 14:12:20.88 E > F: . 4097:4098(1) ack 1 win 4096 + 14:12:20.91 F > E: . ack 4097 win 0 (DF) + 14:12:21.94 F > E: R 1002988444:1002988444(0) win 4096 + + When the application terminates at 14:12:21.94, F immediately + sends a RST. + + Note: Machine E's zero-window probing is (finally) correct. + + How to detect + The problem can often be detected by inspecting packet traces of a + transfer in which the receiving application terminates abnormally. + When doing so, there can be an ambiguity (if only looking at the + trace) as to whether the receiving TCP did indeed have unread data + that it could now no longer deliver. To provoke this to happen, + it may help to suspend the receiving application so that it fails + to consume any data, eventually exhausting the advertised window. + At this point, since the advertised window is zero, we know that + + + + + +Paxson, et. al. Informational [Page 53] + +RFC 2525 TCP Implementation Problems March 1999 + + + the receiving TCP has undelivered data buffered up. Terminating + the application process then should suffice to test the + correctness of the TCP's behavior. 
+ +2.18. + + Name of Problem + Options missing from TCP MSS calculation + + Classification + Reliability / performance + + Description + When a TCP determines how much data to send per packet, it + calculates a segment size based on the MTU of the path. It must + then subtract from that MTU the size of the IP and TCP headers in + the packet. If IP options and TCP options are not taken into + account correctly in this calculation, the resulting segment size + may be too large. TCPs that do so are said to exhibit "Options + missing from TCP MSS calculation". + + Significance + In some implementations, this causes the transmission of strangely + fragmented packets. In some implementations with Path MTU (PMTU) + discovery [RFC1191], this problem can actually result in a total + failure to transmit any data at all, regardless of the environment + (see below). + + Arguably, especially since the wide deployment of firewalls, IP + options appear only rarely in normal operations. + + Implications + In implementations using PMTU discovery, this problem can result + in packets that are too large for the output interface, and that + have the DF (don't fragment) bit set in the IP header. Thus, the + IP layer on the local machine is not allowed to fragment the + packet to send it out the interface. It instead informs the TCP + layer of the correct MTU size of the interface; the TCP layer + again miscomputes the MSS by failing to take into account the size + of IP options; and the problem repeats, with no data flowing. + + Relevant RFCs + RFC 1122 describes the calculation of the effective send MSS. RFC + 1191 describes Path MTU discovery. + + + + + + + +Paxson, et. al. Informational [Page 54] + +RFC 2525 TCP Implementation Problems March 1999 + + + Trace file demonstrating it + Trace file taken using tcpdump on host C. 
The first trace + demonstrates the fragmentation that occurs without path MTU + discovery: + + 13:55:25.488728 A.65528 > C.discard: + P 567833:569273(1440) ack 1 win 17520 + + (frag 20828:1472@0+) + (ttl 62, optlen=8 LSRR{B#} NOP) + + 13:55:25.488943 A > C: + (frag 20828:8@1472) + (ttl 62, optlen=8 LSRR{B#} NOP) + + 13:55:25.489052 C.discard > A.65528: + . ack 566385 win 60816 + (DF) + (ttl 60, id 41266) + + Host A repeatedly sends 1440-octet data segments, but these are + fragmented into two packets, one with 1432 octets of data, and + another with 8 octets of data. + + The second trace demonstrates the failure to send any data + segments, sometimes seen with hosts doing path MTU discovery: + + 13:55:44.332219 A.65527 > C.discard: + S 1018235390:1018235390(0) win 16384 + (DF) + (ttl 62, id 20912, optlen=8 LSRR{B#} NOP) + + 13:55:44.333015 C.discard > A.65527: + S 1271629000:1271629000(0) ack 1018235391 win 60816 + (DF) + (ttl 60, id 41427) + + 13:55:44.333206 C.discard > A.65527: + S 1271629000:1271629000(0) ack 1018235391 win 60816 + (DF) + (ttl 60, id 41427) + + This is all of the activity seen on this connection. Eventually + host C will time out attempting to establish the connection. + + How to detect + The "netcat" utility [Hobbit96] is useful for generating source + routed packets: + + + +Paxson, et. al. Informational [Page 55] + +RFC 2525 TCP Implementation Problems March 1999 + + + 1% nc C discard + (interactive typing) + ^C + 2% nc C discard < /dev/zero + ^C + 3% nc -g B C discard + (interactive typing) + ^C + 4% nc -g B C discard < /dev/zero + ^C + + Lines 1 through 3 should generate appropriate packets, which can + be verified using tcpdump. If the problem is present, line 4 + should generate one of the two kinds of packet traces shown. + + How to fix + The implementation should ensure that the effective send MSS + calculation includes a term for the IP and TCP options, as + mandated by RFC 1122. + +3. 
Security Considerations + + This memo does not discuss any specific security-related TCP + implementation problems, as the working group decided to pursue + documenting those in a separate document. Some of the implementation + problems discussed here, however, can be used for denial-of-service + attacks. Those classified as congestion control present + opportunities to subvert TCPs used for legitimate data transfer into + excessively loading network elements. Those classified as + "performance", "reliability" and "resource management" may be + exploitable for launching surreptitious denial-of-service attacks + against the user of the TCP. Both of these types of attacks can be + extremely difficult to detect because in most respects they look + identical to legitimate network traffic. + +4. Acknowledgements + + Thanks to numerous correspondents on the tcp-impl mailing list for + their input: Steve Alexander, Larry Backman, Jerry Chu, Alan Cox, + Kevin Fall, Richard Fox, Jim Gettys, Rick Jones, Allison Mankin, Neal + McBurnett, Perry Metzger, der Mouse, Thomas Narten, Andras Olah, + Steve Parker, Francesco Potorti`, Luigi Rizzo, Allyn Romanow, Al + Smith, Jerry Toporek, Joe Touch, and Curtis Villamizar. + + Thanks also to Josh Cohen for the traces documenting the "Failure to + send a RST after Half Duplex Close" problem; and to John Polstra, who + analyzed the "Window probe deadlock" problem. + + + + +Paxson, et. al. Informational [Page 56] + +RFC 2525 TCP Implementation Problems March 1999 + + +5. References + + [Allman97] M. Allman, "Fixing Two BSD TCP Bugs," Technical Report + CR-204151, NASA Lewis Research Center, Oct. 1997. + http://roland.grc.nasa.gov/~mallman/papers/bug.ps + + [RFC2414] Allman, M., Floyd, S. and C. Partridge, "Increasing + TCP's Initial Window", RFC 2414, September 1998. + + [RFC1122] Braden, R., Editor, "Requirements for Internet Hosts -- + Communication Layers", STD 3, RFC 1122, October 1989. 
+ + [RFC2119] Bradner, S., "Key words for use in RFCs to Indicate + Requirement Levels", BCP 14, RFC 2119, March 1997. + + [Brakmo95] L. Brakmo and L. Peterson, "Performance Problems in + BSD4.4 TCP," ACM Computer Communication Review, + 25(5):69-86, 1995. + + [RFC813] Clark, D., "Window and Acknowledgement Strategy in TCP," + RFC 813, July 1982. + + [Dawson97] S. Dawson, F. Jahanian, and T. Mitton, "Experiments on + Six Commercial TCP Implementations Using a Software + Fault Injection Tool," to appear in Software Practice & + Experience, 1997. A technical report version of this + paper can be obtained at + ftp://rtcl.eecs.umich.edu/outgoing/sdawson/CSE-TR-298- + 96.ps.gz. + + [Fall96] K. Fall and S. Floyd, "Simulation-based Comparisons of + Tahoe, Reno, and SACK TCP," ACM Computer Communication + Review, 26(3):5-21, 1996. + + [Hobbit96] Hobbit, Avian Research, netcat, available via anonymous + ftp to ftp.avian.org, 1996. + + [Hoe96] J. Hoe, "Improving the Start-up Behavior of a Congestion + Control Scheme for TCP," Proc. SIGCOMM '96. + + [Jacobson88] V. Jacobson, "Congestion Avoidance and Control," Proc. + SIGCOMM '88. ftp://ftp.ee.lbl.gov/papers/congavoid.ps.Z + + [Jacobson89] V. Jacobson, C. Leres, and S. McCanne, tcpdump, + available via anonymous ftp to ftp.ee.lbl.gov, Jun. + 1989. + + + + + +Paxson, et. al. Informational [Page 57] + +RFC 2525 TCP Implementation Problems March 1999 + + + [RFC2018] Mathis, M., Mahdavi, J., Floyd, S. and A. Romanow, "TCP + Selective Acknowledgement Options", RFC 2018, October + 1996. + + [RFC1191] Mogul, J. and S. Deering, "Path MTU discovery", RFC + 1191, November 1990. + + [RFC896] Nagle, J., "Congestion Control in IP/TCP Internetworks", + RFC 896, January 1984. + + [Paxson97] V. Paxson, "Automated Packet Trace Analysis of TCP + Implementations," Proc. SIGCOMM '97, available from + ftp://ftp.ee.lbl.gov/papers/vp-tcpanaly-sigcomm97.ps.Z. 
+ + [RFC793] Postel, J., Editor, "Transmission Control Protocol," STD + 7, RFC 793, September 1981. + + [RFC2001] Stevens, W., "TCP Slow Start, Congestion Avoidance, Fast + Retransmit, and Fast Recovery Algorithms", RFC 2001, + January 1997. + + [Stevens94] W. Stevens, "TCP/IP Illustrated, Volume 1", Addison- + Wesley Publishing Company, Reading, Massachusetts, 1994. + + [Wright95] G. Wright and W. Stevens, "TCP/IP Illustrated, Volume + 2", Addison-Wesley Publishing Company, Reading + Massachusetts, 1995. + +6. Authors' Addresses + + Vern Paxson + ACIRI / ICSI + 1947 Center Street + Suite 600 + Berkeley, CA 94704-1198 + + Phone: +1 510/642-4274 x302 + EMail: vern@aciri.org + + + + + + + + + + + + + +Paxson, et. al. Informational [Page 58] + +RFC 2525 TCP Implementation Problems March 1999 + + + Mark Allman + NASA Glenn Research Center/Sterling Software + Lewis Field + 21000 Brookpark Road + MS 54-2 + Cleveland, OH 44135 + USA + + Phone: +1 216/433-6586 + Email: mallman@grc.nasa.gov + + Scott Dawson + Real-Time Computing Laboratory + EECS Building + University of Michigan + Ann Arbor, MI 48109-2122 + USA + + Phone: +1 313/763-5363 + EMail: sdawson@eecs.umich.edu + + + William C. Fenner + Xerox PARC + 3333 Coyote Hill Road + Palo Alto, CA 94304 + USA + + Phone: +1 650/812-4816 + EMail: fenner@parc.xerox.com + + + Jim Griner + NASA Glenn Research Center + Lewis Field + 21000 Brookpark Road + MS 54-2 + Cleveland, OH 44135 + USA + + Phone: +1 216/433-5787 + EMail: jgriner@grc.nasa.gov + + + + + + + + + +Paxson, et. al. Informational [Page 59] + +RFC 2525 TCP Implementation Problems March 1999 + + + Ian Heavens + Spider Software Ltd. 
+ 8 John's Place, Leith + Edinburgh EH6 7EL + UK + + Phone: +44 131/475-7015 + EMail: ian@spider.com + + Kevin Lahey + NASA Ames Research Center/MRJ + MS 258-6 + Moffett Field, CA 94035 + USA + + Phone: +1 650/604-4334 + EMail: kml@nas.nasa.gov + + + Jeff Semke + Pittsburgh Supercomputing Center + 4400 Fifth Ave + Pittsburgh, PA 15213 + USA + + Phone: +1 412/268-4960 + EMail: semke@psc.edu + + + Bernie Volz + Process Software Corporation + 959 Concord Street + Framingham, MA 01701 + USA + + Phone: +1 508/879-6994 + EMail: volz@process.com + + + + + + + + + + + + + + +Paxson, et. al. Informational [Page 60] + +RFC 2525 TCP Implementation Problems March 1999 + + +7. Full Copyright Statement + + Copyright (C) The Internet Society (1999). All Rights Reserved. + + This document and translations of it may be copied and furnished to + others, and derivative works that comment on or otherwise explain it + or assist in its implementation may be prepared, copied, published + and distributed, in whole or in part, without restriction of any + kind, provided that the above copyright notice and this paragraph are + included on all such copies and derivative works. However, this + document itself may not be modified in any way, such as by removing + the copyright notice or references to the Internet Society or other + Internet organizations, except as needed for the purpose of + developing Internet standards in which case the procedures for + copyrights defined in the Internet Standards process must be + followed, or as required to translate it into languages other than + English. + + The limited permissions granted above are perpetual and will not be + revoked by the Internet Society or its successors or assigns. 
+ + This document and the information contained herein is provided on an + "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING + TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING + BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION + HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF + MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + + + + + + + + + + + + + + + + + + + + + + + + +Paxson, et. al. Informational [Page 61] + diff --git a/ext/picotcp/RFC/rfc2757.txt b/ext/picotcp/RFC/rfc2757.txt new file mode 100644 index 0000000..e49f141 --- /dev/null +++ b/ext/picotcp/RFC/rfc2757.txt @@ -0,0 +1,2579 @@ + + + + + + +Network Working Group G. Montenegro +Request for Comments: 2757 Sun Microsystems, Inc. +Category: Informational S. Dawkins + Nortel Networks + M. Kojo + University of Helsinki + V. Magret + Alcatel + N. Vaidya + Texas A&M University + January 2000 + + + Long Thin Networks + +Status of this Memo + + This memo provides information for the Internet community. It does + not specify an Internet standard of any kind. Distribution of this + memo is unlimited. + +Copyright Notice + + Copyright (C) The Internet Society (2000). All Rights Reserved. + +Abstract + + In view of the unpredictable and problematic nature of long thin + networks (for example, wireless WANs), arriving at an optimized + transport is a daunting task. We have reviewed the existing + proposals along with future research items. Based on this overview, + we also recommend mechanisms for implementation in long thin + networks. + + Our goal is to identify a TCP that works for all users, including + users of long thin networks. We started from the working + recommendations of the IETF TCP Over Satellite Links (tcpsat) working + group with this end in mind. 
+ + We recognize that not every tcpsat recommendation will be required + for long thin networks as well, and work toward a set of TCP + recommendations that are 'benign' in environments that do not require + them. + + + + + + + + +Montenegro, et al. Informational [Page 1] + +RFC 2757 Long Thin Networks January 2000 + + +Table of Contents + + 1 Introduction ................................................. 3 + 1.1 Network Architecture .................................... 5 + 1.2 Assumptions about the Radio Link ........................ 6 + 2 Should it be IP or Not? ..................................... 7 + 2.1 Underlying Network Error Characteristics ................ 7 + 2.2 Non-IP Alternatives ..................................... 8 + 2.2.1 WAP ................................................ 8 + 2.2.2 Deploying Non-IP Alternatives ...................... 9 + 2.3 IP-based Considerations ................................. 9 + 2.3.1 Choosing the MTU [Stevens94, RFC1144] .............. 9 + 2.3.2 Path MTU Discovery [RFC1191] ....................... 10 + 2.3.3 Non-TCP Proposals .................................. 10 + 3 The Case for TCP ............................................. 11 + 4 Candidate Optimizations ...................................... 12 + 4.1 TCP: Current Mechanisms ................................. 12 + 4.1.1 Slow Start and Congestion Avoidance ................ 12 + 4.1.2 Fast Retransmit and Fast Recovery .................. 12 + 4.2 Connection Setup with T/TCP [RFC1397, RFC1644] .......... 14 + 4.3 Slow Start Proposals .................................... 14 + 4.3.1 Larger Initial Window .............................. 14 + 4.3.2 Growing the Window during Slow Start ............... 15 + 4.3.2.1 ACK Counting .................................. 15 + 4.3.2.2 ACK-every-segment ............................. 16 + 4.3.3 Terminating Slow Start ............................. 17 + 4.3.4 Generating ACKs during Slow Start .................. 
17 + 4.4 ACK Spacing ............................................. 17 + 4.5 Delayed Duplicate Acknowlegements ....................... 18 + 4.6 Selective Acknowledgements [RFC2018] .................... 18 + 4.7 Detecting Corruption Loss ............................... 19 + 4.7.1 Without Explicit Notification ...................... 19 + 4.7.2 With Explicit Notifications ........................ 20 + 4.8 Active Queue Management ................................. 21 + 4.9 Scheduling Algorithms ................................... 21 + 4.10 Split TCP and Performance-Enhancing Proxies (PEPs) ..... 22 + 4.10.1 Split TCP Approaches .............................. 23 + 4.10.2 Application Level Proxies ......................... 26 + 4.10.3 Snoop and its Derivatives ......................... 27 + 4.10.4 PEPs to handle Periods of Disconnection ........... 29 + 4.11 Header Compression Alternatives ........................ 30 + 4.12 Payload Compression .................................... 31 + 4.13 TCP Control Block Interdependence [Touch97] ............ 32 + 5 Summary of Recommended Optimizations ......................... 33 + 6 Conclusion ................................................... 35 + 7 Acknowledgements ............................................. 35 + 8 Security Considerations ...................................... 35 + + + + +Montenegro, et al. Informational [Page 2] + +RFC 2757 Long Thin Networks January 2000 + + + 9 References ................................................... 36 + Authors' Addresses ............................................. 44 + Full Copyright Statement ....................................... 46 + +1 Introduction + + Optimized wireless networking is one of the major hurdles that Mobile + Computing must solve if it is to enable ubiquitous access to + networking resources. However, current data networking protocols have + been optimized primarily for wired networks. 
Wireless environments + have very different characteristics in terms of latency, jitter, and + error rate as compared to wired networks. Accordingly, traditional + protocols are ill-suited to this medium. + + Mobile Wireless networks can be grouped in W-LANs (for example, + 802.11 compliant networks) and W-WANs (for example, CDPD [CDPD], + Ricochet, CDMA [CDMA], PHS, DoCoMo, GSM [GSM] to name a few). W-WANs + present the most serious challenge, given that the length of the + wireless link (expressed as the delay*bandwidth product) is typically + 4 to 5 times as long as that of its W-LAN counterparts. For example, + for an 802.11 network, assuming the delay (round-trip time) is about + 3 ms. and the bandwidth is 1.5 Mbps, the delay*bandwidth product is + 4500 bits. For a W-WAN such as Ricochet, a typical round-trip time + may be around 500 ms. (the best is about 230 ms.), and the sustained + bandwidth is about 24 Kbps. This yields a delay*bandwidth product + roughly equal to 1.5 KB. In the near future, 3rd Generation wireless + services will offer 384Kbps and more. Assuming a 200 ms round-trip, + the delay*bandwidth product in this case is 76.8 Kbits (9.6 KB). This + value is larger than the default 8KB buffer space used by many TCP + implementations. This means that, whereas for W-LANs the default + buffer space is enough, future W-WANs will operate inefficiently + (that is, they will not be able to fill the pipe) unless they + override the default value. A 3rd Generation wireless service + offering 2 Mbps with 200-millisecond latency requires a 50 KB buffer. + + Most importantly, latency across a link adversely affects + throughput. For example, [MSMO97] derives an upper bound on TCP + throughput. Indeed, the resultant expression is inversely related to + the round-trip time. + + The long latencies also push the limits (and commonly transgress + them) for what is acceptable to users of interactive applications. 
As a quick glance at our list of references will reveal, there is a + wealth of proposals that attempt to solve the wireless networking + problem.
In particular, round-trip times in a low-earth + orbiting (LEO) satellite network may be as little as a few + milliseconds (and never extend beyond 160 to 200 ms). W-WANs share + the "L" with LFNs. However, satellite networks are also "fat" in the + sense that they may have high bandwidth. Satellite networks may often + have a delay*bandwidth product above 64 KBytes, in which case they + pose additional problems to TCP [TCPHP]. W-WANs do not generally + exhibit this behavior. Accordingly, this document only deals with + links that are "long thin pipes", and the networks that contain them: + "long thin networks". We call these "LTNs". + + This document does not give an overview of the API used to access the + underlying transport. We believe this is an orthogonal issue, even + though some of the proposals below have been put forth assuming a + given interface. It is possible, for example, to support the + traditional socket semantics without fully relying on TCP/IP + transport [MOWGLI]. + + Our focus is on the on-the-wire protocols. We try to include the most + relevant ones and briefly (given that we provide the references + needed for further study) mention their most salient points. + + + + + + + +Montenegro, et al. Informational [Page 4] + +RFC 2757 Long Thin Networks January 2000 + + +1.1 Network Architecture + + One significant difference between LFNs and LTNs is that we assume + the W-WAN link is the last hop to the end user. This allows us to + assume that a single intermediate node sees all packets transferred + between the wireless mobile device and the rest of the Internet. + This is only one of the topologies considered by the TCP Satellite + community. 
+ + Given our focus on mobile wireless applications, we only consider a + very specific architecture that includes: + + - a wireless mobile device, connected via + + - a wireless link (which may, in fact comprise several hops at + the link layer), to + + - an intermediate node (sometimes referred to as a base station) + connected via + + - a wireline link, which in turn interfaces with + + - the landline Internet and millions of legacy servers and web + sites. + + Specifically, we are not as concerned with paths that include two + wireless segments separated by a wired one. This may occur, for + example, if one mobile device connects across its immediate wireless + segment via an intermediate node to the Internet, and then via a + second wireless segment to another mobile device. Quite often, + mobile devices connect to a legacy server on the wired Internet. + + Typically, the endpoints of the wireless segment are the intermediate + node and the mobile device. However, the latter may be a wireless + router to a mobile network. This is also important and has + applications in, for example, disaster recovery. + + Our target architecture has implications which concern the + deployability of candidate solutions. In particular, an important + requirement is that we cannot alter the networking stack on the + legacy servers. It would be preferable to only change the networking + stack at the intermediate node, although changing it at the mobile + devices is certainly an option and perhaps a necessity. + + We envision mobile devices that can use the wireless medium very + efficiently, but overcome some of its traditional constraints. That + is, full mobility implies that the devices have the flexibility and + agility to use whichever happens to be the best network connection + + + +Montenegro, et al. Informational [Page 5] + +RFC 2757 Long Thin Networks January 2000 + + + available at any given point in time or space. 
Accordingly, devices + could switch from a wired office LAN and hand over their ongoing + connections to continue on, say, a wireless WAN. This type of agility + also requires Mobile IP [RFC2002]. + +1.2 Assumptions about the Radio Link + + The system architecture described above assumes at most one wireless + link (perhaps comprising more than one wireless hop). However, this + is not enough to characterize a wireless link. Additional + considerations are: + + - What are the error characteristics of the wireless medium? The + link may present a higher BER than a wireline network due to + burst errors and disconnections. The techniques below usually + do not address all the types of errors. Accordingly, a complete + solution should combine the best of all the proposals. + Nevertheless, in this document we are more concerned with (and + give preference to solving) the most typical case: (1) higher + BER due to random errors (which implies longer and more + variable delays due to link-layer error corrections and + retransmissions) rather than (2) an interruption in service due + to a handoff or a disconnection. The latter are also important + and we do include relevant proposals in this survey. + + - Is the wireless service datagram oriented, or is it a virtual + circuit? Currently, switched virtual circuits are more common, + but packet networks are starting to appear, for example, + Metricom's Starmode [CB96], CDPD [CDPD] and General Packet + Radio Service (GPRS) [GPRS],[BW97] in GSM. + + - What kind of reliability does the link provide? Wireless + services typically retransmit a packet (frame) until it has + been acknowledged by the target. They may allow the user to + turn off this behavior. For example, GSM allows RLP [RLP] + (Radio Link Protocol) to be turned off. Metricom has a + similar "lightweight" mode. In GSM RLP, a frame is + retransmitted until the maximum number of retransmissions + (protocol parameter) is reached. 
What happens when this limit + is reached is determined by the telecom operator: the physical + link connection is either disconnected or a link reset is + enforced where the sequence numbers are resynchronized and the + transmit and receive buffers are flushed resulting in lost + data. Some wireless services, like CDMA IS95-RLP [CDMA, + Karn93], limit the latency on the wireless link by + retransmitting a frame only a couple of times. This decreases + the residual frame error rate significantly, but does not + provide fully reliable link service. + + + +Montenegro, et al. Informational [Page 6] + +RFC 2757 Long Thin Networks January 2000 + + + - Does the mobile device transmit and receive at the same time? + Doing so increases the cost of the electronics on the mobile + device. Typically, this is not the case. We assume in this + document that mobile devices do not transmit and receive + simultaneously. + + - Does the mobile device directly address more than one peer on + the wireless link? Packets to each different peer may traverse + spatially distinct wireless paths. Accordingly, the path to + each peer may exhibit very different characteristics. Quite + commonly, the mobile device addresses only one peer (the + intermediate node) at any given point in time. When this is + not the case, techniques such as Channel-State Dependent Packet + Scheduling come into play (see the section "Packet Scheduling" + below). + +2 Should it be IP or Not? + + The first decision is whether to use IP as the underlying network + protocol or not. In particular, some data protocols evolved from + wireless telephony are not always -- though at times they may be -- + layered on top of IP [MOWGLI, WAP]. These proposals are based on the + concept of proxies that provide adaptation services between the + wireless and wireline segments. + + This is a reasonable model for mobile devices that always communicate + through the proxy. 
However, we expect many wireless mobile devices to + utilize wireline networks whenever they are available. This model + closely follows current laptop usage patterns: devices typically + utilize LANs, and only resort to dial-up access when "out of the + office." + + For these devices, an architecture that assumes IP is the best + approach, because it will be required for communications that do not + traverse the intermediate node (for example, upon reconnection to a + W-LAN or a 10BaseT network at the office). + +2.1 Underlying Network Error Characteristics + + Using IP as the underlying network protocol requires a certain (low) + level of link robustness that is expected of wireless links. + + IP, and the protocols that are carried in IP packets, are protected + end-to-end by checksums that are relatively weak [Stevens94, + Paxson97] (and, in some cases, optional). For much of the Internet, + these checksums are sufficient; in wireless environments, the error + characteristics of the raw wireless link are much less robust than + the rest of the end-to-end path. Hence for paths that include + + + +Montenegro, et al. Informational [Page 7] + +RFC 2757 Long Thin Networks January 2000 + + + wireless links, exclusively relying on end-to-end mechanisms to + detect and correct transmission errors is undesirable. These should + be complemented by local link-level mechanisms. Otherwise, damaged IP + packets are propagated through the network only to be discarded at + the destination host. For example, intermediate routers are required + to check the IP header checksum, but not the UDP or TCP checksums. + Accordingly, when the payload of an IP packet is corrupted, this is + not detected until the packet arrives at its ultimate destination. + + A better approach is to use link-layer mechanisms such as FEC, + retransmissions, and so on in order to improve the characteristics of + the wireless link and present a much more reliable service to IP. 
+ This approach has been taken by CDPD, Ricochet and CDMA. + + This approach is roughly analogous to the successful deployment of + Point-to-Point Protocol (PPP), with robust framing and 16-bit + checksumming, on wireline networks as a replacement for the Serial + Line Interface Protocol (SLIP), with only a single framing byte and + no checksumming. + + [AGS98] recommends the use of FEC in satellite environments. + + Notice that the link-layer could adapt its frame size to the + prevalent BER. It would perform its own fragmentation and reassembly + so that IP could still enjoy a large enough MTU size [LS98]. + + A common concern for using IP as a transport is the header overhead + it implies. Typically, the underlying link-layer appears as PPP + [RFC1661] to the IP layer above. This allows for header compression + schemes [IPHC, IPHC-RTP, IPHC-PPP] which greatly alleviate the + problem. + +2.2 Non-IP Alternatives + + A number of non-IP alternatives aimed at wireless environments have + been proposed. One representative proposal is discussed here. + +2.2.1 WAP + + The Wireless Application Protocol (WAP) specifies an application + framework and network protocols for wireless devices such as mobile + telephones, pagers, and PDAs [WAP]. The architecture requires a proxy + between the mobile device and the server. The WAP protocol stack is + layered over a datagram transport service. Such a service is + provided by most wireless networks; for example, IS-136, GSM + SMS/USSD, and UDP in IP networks like CDPD and GSM GPRS. The core of + + + + + +Montenegro, et al. Informational [Page 8] + +RFC 2757 Long Thin Networks January 2000 + + + the WAP protocols is a binary HTTP/1.1 protocol with additional + features such as header caching between requests and a shared state + between client and server. 
+ +2.2.2 Deploying Non-IP Alternatives + + IP is such a fundamental element of the Internet that non-IP + alternatives face substantial obstacles to deployment, because they + do not exploit the IP infrastructure. Any non-IP alternative that is + used to provide gatewayed access to the Internet must map between IP + addresses and non-IP addresses, must terminate IP-level security at a + gateway, and cannot use IP-oriented discovery protocols (Dynamic Host + Configuration Protocol, Domain Name Services, Lightweight Directory + Access Protocol, Service Location Protocol, etc.) without translation + at a gateway. + + A further complexity occurs when a device supports both wireless and + wireline operation. If the device uses IP for wireless operation, + uninterrupted operation when the device is connected to a wireline + network is possible (using Mobile IP). If a non-IP alternative is + used, this switchover is more difficult to accomplish. + + Non-IP alternatives face the burden of proof that IP is so ill-suited + to a wireless environment that it is not a viable technology. + +2.3 IP-based Considerations + + Given its worldwide deployment, IP is an obvious choice for the + underlying network technology. Optimizations implemented at this + level benefit traditional Internet application protocols as well as + new ones layered on top of IP or UDP. + +2.3.1 Choosing the MTU [Stevens94, RFC1144] + + In slow networks, the time required to transmit the largest possible + packet may be considerable. Interactive response time should not + exceed the well-known human factors limit of 100 to 200 ms. This + should be considered the maximum time budget to (1) send a packet and + (2) obtain a response. In most networking stack implementations, (1) + is highly dependent on the maximum transmission unit (MTU). In the + worst case, a small packet from an interactive application may have + to wait for a large packet from a bulk transfer application before + being sent. 
Hence, a good rule of thumb is to choose an MTU such that + its transmission time is less than (or not much larger than) 200 ms. + + + + + + + +Montenegro, et al. Informational [Page 9] + +RFC 2757 Long Thin Networks January 2000 + + + Of course, compression and type-of-service queuing (whereby + interactive data packets are given a higher priority) may alleviate + this problem. In particular, the latter may reduce the average wait + time to about half the MTU's transmission time. + +2.3.2 Path MTU Discovery [RFC1191] + + Path MTU discovery benefits any protocol built on top of IP. It + allows a sender to determine what the maximum end-to-end transmission + unit is to a given destination. Without Path MTU discovery, the + default IPv4 MTU size is 576. The benefits of using a larger MTU are: + + - Smaller ratio of header overhead to data + + - Allows TCP to grow its congestion window faster, since it + increases in units of segments. + + Of course, for a given BER, a larger MTU has a correspondingly larger + probability of error within any given segment. The BER may be reduced + using lower level techniques like FEC and link-layer retransmissions. + The issue is that now delays may become a problem due to the + additional retransmissions, and the fact that packet transmission + time increases with a larger MTU. + + Recommendation: Path MTU discovery is recommended. [AGS98] already + recommends its use in satellite environments. + +2.3.3 Non-TCP Proposals + + Other proposals assume an underlying IP datagram service, and + implement an optimized transport either directly on top of IP + [NETBLT] or on top of UDP [MNCP]. Not relying on TCP is a bold move, + given the wealth of experience and research related to it. It could + be argued that the Internet has not collapsed because its main + protocol, TCP, is very careful in how it uses the network, and + generally treats it as a black box assuming all packet losses are due + to congestion and prudently backing off. 
This avoids further + congestion. + + However, in the wireless medium, packet losses may also be due to + corruption due to high BER, fading, and so on. Here, the right + approach is to try harder, instead of backing off. Alternative + transport protocols are: + + - NETBLT [NETBLT, RFC1986, RFC1030] + + - MNCP [MNCP] + + + + +Montenegro, et al. Informational [Page 10] + +RFC 2757 Long Thin Networks January 2000 + + + - ESRO [RFC2188] + + - RDP [RFC908, RFC1151] + + - VMTP [VMTP] + +3 The Case for TCP + + This is one of the most hotly debated issues in the wireless arena. + Here are some arguments against it: + + - It is generally recognized that TCP does not perform well in + the presence of significant levels of non-congestion loss. TCP + detractors argue that the wireless medium is one such case, and + that it is hard enough to fix TCP. They argue that it is easier + to start from scratch. + + - TCP has too much header overhead. + + - By the time the mechanisms are in place to fix it, TCP is very + heavy, and ill-suited for use by lightweight, portable devices. + + and here are some in support of TCP: + + - It is preferable to continue using the same protocol that the + rest of the Internet uses for compatibility reasons. Any + extensions specific to the wireless link may be negotiated. + + - Legacy mechanisms may be reused (for example three-way + handshake). + + - Link-layer FEC and ARQ can reduce the BER such that any losses + TCP does see are, in fact, caused by congestion (or a sustained + interruption of link connectivity). Modern W-WAN technologies + do this (CDPD, US-TDMA, CDMA, GSM), thus improving TCP + throughput. + + - Handoffs among different technologies are made possible by + Mobile IP [RFC2002], but only if the same protocols, namely + TCP/IP, are used throughout. + + - Given TCP's wealth of research and experience, alternative + protocols are relatively immature, and the full implications of + their widespread deployment not clearly understood. 
+ + Overall, we feel that the performance of TCP over long-thin networks + can be improved significantly. Mechanisms to do so are discussed in + the next sections. + + + +Montenegro, et al. Informational [Page 11] + +RFC 2757 Long Thin Networks January 2000 + + +4 Candidate Optimizations + + There is a large volume of work on the subject of optimizing TCP for + operation over wireless media. Even though satellite networks + generally fall in the LFN regime, our current LTN focus has much to + benefit from it. For example, the work of the TCP-over-Satellite + working group of the IETF has been extremely helpful in preparing + this section [AGS98, ADGGHOSSTT98]. + +4.1 TCP: Current Mechanisms + + A TCP sender adapts its use of bandwidth based on feedback from the + receiver. The high latency characteristic of LTNs implies that TCP's + adaptation is correspondingly slower than on networks with shorter + delays. Similarly, delayed ACKs exacerbate the perceived latency on + the link. Given that TCP grows its congestion window in units of + segments, small MTUs may slow adaptation even further. + +4.1.1 Slow Start and Congestion Avoidance + + Slow Start and Congestion Avoidance [RFC2581] are essential the + Internet's stability. However there are two reasons why the wireless + medium adversely affects them: + + - Whenever TCP's retransmission timer expires, the sender assumes + that the network is congested and invokes slow start. This is + why it is important to minimize the losses caused by + corruption, leaving only those caused by congestion (as + expected by TCP). + + - The sender increases its window based on the number of ACKs + received. Their rate of arrival, of course, is dependent on the + RTT (round-trip-time) between sender and receiver, which + implies long ramp-up times in high latency links like LTNs. The + dependency lasts until the pipe is filled. + + - During slow start, the sender increases its window in units of + segments. 
This is why it is important to use an appropriately + large MTU which, in turn, requires link layers with + low loss.
Moreover, efficient recovery from multiple + losses within a single window requires adoption of new proposals + (NewReno [RFC2582]). In addition, on slow paths with no packet + reordering waiting for three duplicate ACKs to arrive postpones + retransmission unnecessarily. + + Recommendation: Implement Fast Retransmit and Fast Recovery at this + time. This is a widely-implemented optimization and is currently at + Proposed Standard level. [AGS98] recommends implementation of Fast + Retransmit/Fast Recovery in satellite environments. NewReno + [RFC2582] apparently does help a sender better handle partial ACKs + and multiple losses in a single window, but at this point is not + recommended due to its experimental nature. Instead, SACK [RFC2018] + is the preferred mechanism. + + + + +Montenegro, et al. Informational [Page 13] + +RFC 2757 Long Thin Networks January 2000 + + +4.2 Connection Setup with T/TCP [RFC1397, RFC1644] + + TCP engages in a "three-way handshake" whenever a new connection is + set up. Data transfer is only possible after this phase has + completed successfully. T/TCP allows data to be exchanged in + parallel with the connection set up, saving valuable time for short + transactions on long-latency networks. + + Recommendation: T/TCP is not recommended, for these reasons: + + - It is an Experimental RFC. + + - It is not widely deployed, and it has to be deployed at both ends + of a connection. + + - Security concerns have been raised that T/TCP is more vulnerable + to address-spoofing attacks than TCP itself. + + - At least some of the benefits of T/TCP (eliminating three-way + handshake on subsequent query-response transactions, for instance) + are also available with persistent connections on HTTP/1.1, which + is more widely deployed. + + [ADGGHOSSTT98] does not have a recommendation on T/TCP in satellite + environments. 
+ +4.3 Slow Start Proposals + + Because slow start dominates the network response seen by interactive + users at the beginning of a TCP connection, a number of proposals + have been made to modify or eliminate slow start in long latency + environments. + + Stability of the Internet is paramount, so these proposals must + demonstrate that they will not adversely affect Internet congestion + levels in significant ways. + +4.3.1 Larger Initial Window + + Traditional slow start, with an initial window of one segment, is a + time-consuming bandwidth adaptation procedure over LTNs. Studies on + an initial window larger than one segment [RFC2414, AHO98] resulted + in the TCP standard supporting a maximum value of 2 [RFC2581]. Higher + values are still experimental in nature. + + + + + + + +Montenegro, et al. Informational [Page 14] + +RFC 2757 Long Thin Networks January 2000 + + + In simulations with an increased initial window of three packets + [RFC2415], this proposal does not contribute significantly to packet + drop rates, and it has the added benefit of improving initial + response times when the peer device delays acknowledgements during + slow start (see next proposal). + + [RFC2416] addresses situations where the initial window exceeds the + number of buffers available to TCP and indicates that this situation + is no different from the case where the congestion window grows + beyond the number of buffers available. + + [RFC2581] now allows an initial congestion window of two segments. A + larger initial window, perhaps as many as four segments, might be + allowed in the future in environments where this significantly + improves performance (LFNs and LTNs). + + Recommendation: Implement this on devices now. The research on this + optimization indicates that 3 segments is a safe initial setting, and + is centering on choosing between 2, 3, and 4. 
For now, use 2 + (following RFC2581), which at least allows clients running query- + response applications to get an initial ACK from unmodified servers + without waiting for a typical delayed ACK timeout of 200 + milliseconds, and saves two round-trips. An initial window of 3 + [RFC2415] looks promising and may be adopted in the future pending + further research and experience. + +4.3.2 Growing the Window during Slow Start + + The sender increases its window based on the flow of ACKs coming back + from the receiver. Particularly during slow start, this flow is very + important. A couple of the proposals that have been studied are (1) + ACK counting and (2) ACK-every-segment. + +4.3.2.1 ACK Counting + + The main idea behind ACK counting is: + + - Make each ACK count to its fullest by growing the window based + on the data being acknowledged (byte counting) instead of the + number of ACKs (ACK counting). This has been shown to cause + bursts which lead to congestion. [Allman98] shows that Limited + Byte Counting (LBC), in which the window growth is limited to 2 + segments, does not lead to as much burstiness, and offers some + performance gains. + + Recommendation: Unlimited byte counting is not recommended. Van + Jacobson cautions against byte counting [TCPSATMIN] because it leads + to burstiness, and recommends ACK spacing [ACKSPACING] instead. + + + +Montenegro, et al. Informational [Page 15] + +RFC 2757 Long Thin Networks January 2000 + + + ACK spacing requires ACKs to consistently pass through a single ACK- + spacing router. This requirement works well for W-WAN environments + if the ACK-spacing router is also the intermediate node. + + Limited byte counting warrants further investigation before we can + recommend this proposal, but it shows promise. + +4.3.2.2 ACK-every-segment + + The main idea behind ACK-every-segment is: + + - Keep a constant stream of ACKs coming back by turning off + delayed ACKs [RFC1122] during slow start. 
ACK-every-segment + must be limited to slow start, in order to avoid penalizing + asymmetric-bandwidth configurations. For instance, a low + bandwidth link carrying acknowledgements back to the sender, + hinders the growth of the congestion window, even if the link + toward the client has a greater bandwidth [BPK99]. + + Even though simulations confirm its promise (it allows receivers to + receive the second segment from unmodified senders without waiting + for a typical delayed ACK timeout of 200 milliseconds), for this + technique to be practical the receiver must acknowledge every segment + only when the sender is in slow start. Continuing to do so when the + sender is in congestion avoidance may have adverse effects on the + mobile device's battery consumption and on traffic in the network. + + This violates a SHOULD in [RFC2581]: delayed acknowledgements SHOULD + be used by a TCP receiver. + + "Disabling Delayed ACKs During Slow Start" is technically + unimplementable, as the receiver has no way of knowing when the + sender crosses ssthresh (the "slow start threshold") and begins using + the congestion avoidance algorithm. If receivers follow + recommendations for increased initial windows, disabling delayed ACKs + during an increased initial window would open the TCP window more + rapidly without doubling ACK traffic in general. However, this + scheme might double ACK traffic if most connections remain in slow- + start. + + Recommendation: ACK only the first segment on a new connection with + no delay. + + + + + + + + + +Montenegro, et al. Informational [Page 16] + +RFC 2757 Long Thin Networks January 2000 + + +4.3.3 Terminating Slow Start + + New mechanisms [ADGGHOSSTT98] are being proposed to improve TCP's + adaptive properties such that the available bandwidth is better + utilized while reducing the possibility of congesting the network. 
+ This results in the closing of the congestion window to 1 segment + (which precludes fast retransmit), and the subsequent slow start + phase. + + Theoretically, an optimum value for slow-start threshold (ssthresh) + allows connection bandwidth utilization to ramp up as aggressively as + possible without "overshoot" (using so much bandwidth that packets + are lost and congestion avoidance procedures are invoked). + + Recommendation: Estimating the slow start threshold is not + recommended. Although this would be helpful if we knew how to do it, + rough consensus on the tcp-impl and tcp-sat mailing lists is that in + non-trivial operational networks there is no reliable method to probe + during TCP startup and estimate the bandwidth available. + +4.3.4 Generating ACKs during Slow Start + + Mitigations that inject additional ACKs (whether "ACK-first-segment" + or "ACK-every-segment-during-slow-start") beyond what today's + conformant TCPs inject are only applicable during the slow-start + phases of a connection. After an initial exchange, the connection + usually completes slow-start, so TCPs only inject additional ACKs + when (1) the connection is closed, and a new connection is opened, or + (2) the TCPs handle idle connection restart correctly by performing + slow start. + + Item (1) is typical when using HTTP/1.0, in which each request- + response transaction requires a new connection. Persistent + connections in HTTP/1.1 help in maintaining a connection in + congestion avoidance instead of constantly reverting to slow-start. + Because of this, these optimizations which are only enabled during + slow-start do not get as much of a chance to act. Item (2), of + course, is independent of HTTP version. + +4.4 ACK Spacing + + During slow start, the sender responds to the incoming ACK stream by + transmitting N+1 segments for each ACK, where N is the number of new + segments acknowledged by the incoming ACK. 
This results in data + being sent at twice the speed at which it can be processed by the + network. Accordingly, queues will form, and due to insufficient + buffering at the bottleneck router, packets may get dropped before + the link's capacity is full. + + + +Montenegro, et al. Informational [Page 17] + +RFC 2757 Long Thin Networks January 2000 + + + Spacing out the ACKs effectively controls the rate at which the + sender will transmit into the network, and may result in little or no + queueing at the bottleneck router [ACKSPACING]. Furthermore, ack + spacing reduces the size of the bursts. + + Recommendation: No recommendation at this time. Continue monitoring + research in this area. + +4.5 Delayed Duplicate Acknowledgements + + As was mentioned above, link-layer retransmissions may decrease the + BER enough that congestion accounts for most packet losses; still, + nothing can be done about interruptions due to handoffs, moving + beyond wireless coverage, etc. In this scenario, it is imperative to + prevent interaction between link-layer retransmission and TCP + retransmission as these layers duplicate each other's efforts. In + such an environment it may make sense to delay TCP's efforts so as to + give the link-layer a chance to recover. With this in mind, the + Delayed Dupacks [MV97, Vaidya99] scheme selectively delays duplicate + acknowledgements at the receiver. It is preferable to allow a local + mechanism to resolve a local problem, instead of invoking TCP's end- + to-end mechanism and incurring the associated costs, both in terms of + wasted bandwidth and in terms of its effect on TCP's window behavior. + + The Delayed Dupacks scheme can be used despite IP encryption since + the intermediate node does not need to examine the TCP headers. + + Currently, it is not well understood how long the receiver should + delay the duplicate acknowledgments.
In particular, the impact of + wireless medium access control (MAC) protocol on the choice of delay + parameter needs to be studied. The MAC protocol may affect the + ability to choose the appropriate delay (either statically or + dynamically). In general, significant variabilities in link-level + retransmission times can have an adverse impact on the performance of + the Delayed Dupacks scheme. Furthermore, as discussed later in + section 4.10.3, Delayed Dupacks and some other schemes (such as Snoop + [SNOOP]) are only beneficial in certain types of network links. + + Recommendation: Delaying duplicate acknowledgements may be useful in + specific network topologies, but a general recommendation requires + further research and experience. + +4.6 Selective Acknowledgements [RFC2018] + + SACK may not be useful in many LTNs, according to Section 1.1 of + [TCPHP]. In particular, SACK is more useful in the LFN regime, + especially if large windows are being used, because there is a + + + + +Montenegro, et al. Informational [Page 18] + +RFC 2757 Long Thin Networks January 2000 + + + considerable probability of multiple segment losses per window. In + the LTN regime, TCP windows are much smaller, and burst errors must + be much longer in duration in order to damage multiple segments. + + Accordingly, the complexity of SACK may not be justifiable, unless + there is a high probability of burst errors and congestion on the + wireless link. A desire for compatibility with TCP recommendations + for non-LTN environments may dictate LTN support for SACK anyway. + + [AGS98] recommends use of SACK with Large TCP Windows in satellite + environments, and notes that this implies support for PAWS + (Protection Against Wrapped Sequence space) and RTTM (Round Trip Time + Measurement) as well. + + Berkeley's SNOOP protocol research [SNOOP] indicates that SACK does + improve throughput for SNOOP when multiple segments are lost per + window [BPSK96]. 
SACK allows SNOOP to recover from multi-segment + losses in one round-trip. In this case, the mobile device needs to + implement some form of selective acknowledgements. If SACK is not + used, TCP may enter congestion avoidance as the time needed to + retransmit the lost segments may be greater than the retransmission + timer. + + Recommendation: Implement SACK now for compatibility with other TCPs + and improved performance with SNOOP. + +4.7 Detecting Corruption Loss + +4.7.1 Without Explicit Notification + + In the absence of explicit notification from the network, some + researchers have suggested statistical methods for congestion + avoidance [Jain89, WC91, VEGAS]. A natural extension of these + heuristics would enable a sender to distinguish between losses caused + by congestion and other causes. The research results on the + reliability of sender-based heuristics is unfavorable [BV97, BV98]. + [BV98a] reports better results in constrained environments using + packet inter-arrival times measured at the receiver, but highly- + variable delay - of the type encountered in wireless environments + during intercell handoff - confounds these heuristics. + + Recommendation: No recommendation at this time - continue to monitor + research results. + + + + + + + + +Montenegro, et al. Informational [Page 19] + +RFC 2757 Long Thin Networks January 2000 + + +4.7.2 With Explicit Notifications + + With explicit notification from the network it is possible to + determine when a loss is due to congestion. 
Several proposals along + these lines include: + + - Explicit Loss Notification (ELN) [BPSK96] + + - Explicit Bad State Notification (EBSN) [BBKVP96] + + - Explicit Loss Notification to the Receiver (ELNR), and Explicit + Delayed Dupack Activation Notification (EDDAN) (notifications + to mobile receiver) [MV97] + + - Explicit Congestion Notification (ECN) [ECN] + + Of these proposals, Explicit Congestion Notification (ECN) seems + closest to deployment on the Internet, and will provide some benefit + for TCP connections on long thin networks (as well as for all other + TCP connections). + + Recommendation: No recommendation at this time. Schemes like ELNR and + EDDAN [MV97], in which the only systems that need to be modified are + the intermediate node and the mobile device, are slated for adoption + pending further research. However, this solution has some + limitations. Since the intermediate node must have access to the TCP + headers, the IP payload must not be encrypted. + + ECN uses the TOS byte in the IP header to carry congestion + information (ECN-capable and Congestion-encountered). This byte is + not encrypted in IPSEC, so ECN can be used on TCP connections that + are encrypted using IPSEC. + + Recommendation: Implement ECN. In spite of this, mechanisms for + explicit corruption notification are still relevant and should be + tracked. + + Note: ECN provides useful information to avoid deteriorating further + a bad situation, but has some limitations for wireless applications. + Absence of packets marked with ECN should not be interpreted by ECN- + capable TCP connections as a green light for aggressive + retransmissions. On the contrary, during periods of extreme network + congestion routers may drop packets marked with explicit notification + because their buffers are exhausted - exactly the wrong time for a + host to begin retransmitting aggressively. + + + + + + +Montenegro, et al. 
Informational [Page 20] + +RFC 2757 Long Thin Networks January 2000 + + +4.8 Active Queue Management + + As has been pointed out above, TCP responds to congestion by closing + down the window and invoking slow start. Long-delay networks take a + particularly long time to recover from this condition. Accordingly, + it is imperative to avoid congestion in LTNs. To remedy this, active + queue management techniques have been proposed as enhancements to + routers throughout the Internet [RED]. The primary motivation for + deployment of these mechanisms is to prevent "congestion collapse" (a + severe degradation in service) by controlling the average queue size + at the routers. As the average queue length grows, Random Early + Detection [RED] increases the possibility of dropping packets. + + The benefits are: + + - Reduce packet drops in routers. By dropping a few packets + before severe congestion sets in, RED avoids dropping bursts of + packets. In other words, the objective is to drop m packets + early to prevent n drops later on, where m is less than n. + + - Provide lower delays. This follows from the smaller queue + sizes, and is particularly important for interactive + applications, for which the inherent delays of wireless links + already push the user experience to the limits of the non- + acceptable. + + - Avoid lock-outs. Lack of resources in a router (and the + resultant packet drops) may, in effect, obliterate throughput + on certain connections. Because of active queue management, it + is more probable for an incoming packet to find available + buffer space at the router. + + Active Queue Management has two components: (1) routers detect + congestion before exhausting their resources, and (2) they provide + some form of congestion indication. Dropping packets via RED is only + one example of the latter. Another way to indicate congestion is to + use ECN [ECN] as discussed above under "Detecting Corruption Loss: + With Explicit Notifications." 
+ + Recommendation: RED is currently being deployed in the Internet, and + LTNs should follow suit. ECN deployment should complement RED's. + +4.9 Scheduling Algorithms + + Active queue management helps control the length of the queues. + Additionally, a general solution requires replacing FIFO with other + scheduling algorithms that improve: + + + + +Montenegro, et al. Informational [Page 21] + +RFC 2757 Long Thin Networks January 2000 + + + 1. Fairness (by policing how different packet streams utilize the + available bandwidth), and + + 2. Throughput (by improving the transmitter's radio channel + utilization). + + For example, fairness is necessary for interactive applications (like + telnet or web browsing) to coexist with bulk transfer sessions. + Proposals here include: + + - Fair Queueing (FQ) [Demers90] + + - Class-based Queueing (CBQ) [Floyd95] + + Even if they are only implemented over the wireless link portion of + the communication path, these proposals are attractive in wireless + LTN environments, because new connections for interactive + applications can have difficulty starting when a bulk TCP transfer + has already stabilized using all available bandwidth. + + In our base architecture described above, the mobile device typically + communicates directly with only one wireless peer at a given time: + the intermediate node. In some W-WANs, it is possible to directly + address other mobiles within the same cell. Direct communication + with each such wireless peer may traverse a spatially distinct path, + each of which may exhibit statistically independent radio link + characteristics. Channel State Dependent Packet Scheduling (CSDP) + [BBKT96] tracks the state of the various radio links (as defined by + the target devices), and gives preferential treatment to packets + destined for radio links in a "good" state. This avoids attempting to + transmit to (and expect acknowledgements from) a peer on a "bad" + radio link, thus improving throughput. 
+ + A further refinement of this idea suggests that both fairness and + throughput can be improved by combining a wireless-enhanced CBQ with + CSDP [FSS98]. + + Recommendation: No recommendation at this time, pending further + study. + +4.10 Split TCP and Performance-Enhancing Proxies (PEPs) + + Given the dramatic differences between the wired and the wireless + links, a very common approach is to provide some impedance matching + where the two different technologies meet: at the intermediate node. + + + + + + +Montenegro, et al. Informational [Page 22] + +RFC 2757 Long Thin Networks January 2000 + + + The idea is to replace an end-to-end TCP connection with two clearly + distinct connections: one across the wireless link, the other across + its wireline counterpart. Each of the two resulting TCP sessions + operates under very different networking characteristics, and may + adopt the policies best suited to its particular medium. For + example, in a specific LTN topology it may be desirable to modify TCP + Fast Retransmit to resend after the first duplicate ack and Fast + Recovery to not shrink the congestion window if the LTN link has an + extremely long RTT, is known to not reorder packets, and is not + subject to congestion. Moreover, on a long-delay link or on a link + with a relatively high bandwidth-delay product it may be desirable to + "slow-start" with a relatively large initial window, even larger than + four segments. While these kinds of TCP modifications can be + negotiated to be employed over the LTN link, they would not be + deployed end-to-end over the global Internet. In LTN topologies where + the underlying link characteristics are known, various similar + types of performance enhancements can be employed without endangering + operations over the global Internet. + + In some proposals, in addition to a PEP mechanism at the intermediate + node, custom protocols are used on the wireless link (for example, + [WAP], [YB94] or [MOWGLI]).
+ + Even if the gains from using non-TCP protocols are moderate or + better, the wealth of research on optimizing TCP for wireless, and + compatibility with the Internet are compelling reasons to adopt TCP + on the wireless link (enhanced as suggested in section 5 below). + +4.10.1 Split TCP Approaches + + Split-TCP proposals include schemes like I-TCP [ITCP] and MTCP [YB94] + which achieve performance improvements by abandoning end-to-end + semantics. + + The Mowgli architecture [MOWGLI] proposes a split approach with + support for various enhancements at all the protocol layers, not only + at the transport layer. Mowgli provides an option to replace the + TCP/IP core protocols on the LTN link with a custom protocol that is + tuned for LTN links [KRLKA97]. In addition, the protocol provides + various features that are useful with LTNs. For example, it provides + priority-based multiplexing of concurrent connections together with + shared flow control, thus offering link capacity to interactive + applications in a timely manner even if there are bandwidth-intensive + background transfers. Also with this option, Mowgli preserves the + socket semantics on the mobile device so that legacy applications can + be run unmodified. + + + + + +Montenegro, et al. Informational [Page 23] + +RFC 2757 Long Thin Networks January 2000 + + + Employing split TCP approaches has several benefits as well as + drawbacks. Benefits related to split TCP approaches include the + following: + + - Splitting the end-to-end TCP connection into two parts is a + straightforward way to shield the problems of the wireless link + from the wireline Internet path, and vice versa. Thus, a split TCP + approach enables applying local solutions to the local problems on + the wireless link.
For example, it automatically solves the + problem of distinguishing congestion related packet losses on the + wireline Internet and packet losses due to transmission error on + the wireless link as these occur on separate TCP connections. + Even if both segments experience congestion, it may be of a + different nature and may be treated as such. Moreover, temporary + disconnections of the wireless link can be effectively shielded + from the wireline Internet. + + - When one of the TCP connections crosses only a single hop wireless + link or a very limited number of hops, some or all link + characteristics for the wireless TCP path are known. For example, + with a particular link we may know that the link provides reliable + delivery of packets, packets are not delivered out of order, or + the link is not subject to congestion. Having this information for + the TCP path one could expect that defining the TCP mitigations to + be employed becomes a significantly easier task. In addition, + several mitigations that cannot be employed safely over the global + Internet, can be successfully employed over the wireless link. + + - Splitting one TCP connection into two separate ones allows much + earlier deployment of various recent proposals to improve TCP + performance over wireless links; only the TCP implementations of + the mobile device and intermediate node need to be modified, thus + allowing the vast number of Internet hosts to continue running the + legacy TCP implementations unmodified. Any mitigations that would + require modification of TCP in these wireline hosts may take far + too long to become widely deployed. + + - Allows exploitation of various application level enhancements + which may give significant performance gains (see section 4.10.2). + + Drawbacks related to split TCP approaches include the following: + + - One of the main criticisms against the split TCP approaches is + that it breaks TCP end-to-end semantics. 
This has various + drawbacks some of which are more severe than others. The most + detrimental drawback is probably that splitting the TCP connection + disables end-to-end usage of IP layer security mechanisms, + precluding the application of IPSec to achieve end-to-end + + + +Montenegro, et al. Informational [Page 24] + +RFC 2757 Long Thin Networks January 2000 + + + security. Still, IPSec could be employed separately in each of the + two parts, thus requiring the intermediate node to become a party + to the security association between the mobile device and the + remote host. This, however, is an undesirable or unacceptable + alternative in most cases. Other security mechanisms above the + transport layer, like TLS [RFC2246] or SOCKS [RFC1928], should be + employed for end-to-end security. + + - Another drawback of breaking end-to-end semantics is that crashes + of the intermediate node become unrecoverable resulting in + termination of the TCP connections. Whether this should be + considered a severe problem depends on the expected frequency of + such crashes. + + - In many occasions claims have been stated that if TCP end-to-end + semantics is broken, applications relying on TCP to provide + reliable data delivery become more vulnerable. This, however, is + an overstatement as a well-designed application should never fully + rely on TCP in achieving end-to-end reliability at the application + level. First, current APIs to TCP, such as the Berkeley socket + interface, do not allow applications to know when a TCP + acknowledgement for previously sent user data arrives at the TCP + sender. Second, even if the application is informed of the TCP + acknowledgements, the sending application cannot know whether the + receiving application has received the data: it only knows that + the data reached the TCP receive buffer at the receiving end.
+ Finally, in order to achieve end-to-end reliability at the + application level an application level acknowledgement is required + to confirm that the receiver has taken the appropriate actions on + the data it received. + + - When a mobile device moves, it is subject to handovers by the + serving base station. If the base station acts as the intermediate + node for the split TCP connection, the state of both TCP endpoints + on the previous intermediate node must be transferred to the new + intermediate node to ensure continued operation over the split TCP + connection. This requires extra work and causes overhead. However, + in most of the W-WAN wireless networks, unlike in W-LANs, the W- + WAN base station does not provide the mobile device with the + connection point to the wireline Internet (such base stations may + not even have an IP stack). Instead, the W-WAN network takes care + of the mobility and retains the connection point to the wireline + Internet unchanged while the mobile device moves. Thus, TCP state + handover is not required in most W-WANs. + + - The packets traversing through all the protocol layers up to + transport layer and again down to the link layer result in extra + overhead at the intermediate node. In case of LTNs with low + + + +Montenegro, et al. Informational [Page 25] + +RFC 2757 Long Thin Networks January 2000 + + + bandwidth, this extra overhead does not cause serious additional + performance problems unlike with W-LANs that typically have much + higher bandwidth. + + - Split TCP proposals are not applicable to networks with asymmetric + routing. Deploying a split TCP approach requires that traffic to + and from the mobile device be routed through the intermediate + node. With some networks, this cannot be accomplished, or it + requires that the intermediate node is located several hops away + from the wireless network edge which in turn is unpractical in + many cases and may result in non-optimal routing. 
+ + - Split TCP, as the name implies, does not address problems related + to UDP. + + It should be noted that using split TCP does not necessarily exclude + simultaneous usage of IP for end-to-end connectivity. Correct usage + of split TCP should be managed per application or per connection and + should be under the end-user control so that the user can decide + whether a particular TCP connection or application makes use of split + TCP or whether it operates end-to-end directly over IP. + + Recommendation: Split TCP proposals that alter TCP semantics are not + recommended. Deploying custom protocols on the wireless link, such as + MOWGLI proposes, is not recommended, because this note gives + preference to (1) improving TCP instead of designing a custom + protocol and (2) allowing end-to-end sessions at all times. + +4.10.2 Application Level Proxies + + Nowadays, application level proxies are widely used in the Internet. + Such proxies include Web proxy caches, relay MTAs (Mail Transfer + Agents), and secure transport proxies (e.g., SOCKS). In effect, + employing an application level proxy results in a "split TCP + connection" with the proxy as the intermediary. Hence, some of the + problems present with wireless links, such as combining of a + congested wide-area Internet path with a wireless LTN link, are + automatically alleviated to some extent. + + The application protocols often employ plenty of (unnecessary) round + trips, lots of headers and inefficient encoding. Even unnecessary + data may get delivered over the wireless link in regular application + protocol operation. In many cases a significant amount of this + overhead can be reduced by simply running an application level proxy + on the intermediate node. With LTN links, significant additional + improvement can be achieved by introducing application level proxies + with application-specific enhancements. Such a proxy may employ an + enhanced version of the application protocol over the wireless link.
+ + + +Montenegro, et al. Informational [Page 26] + +RFC 2757 Long Thin Networks January 2000 + + + In an LTN environment enhancements at the application layer may + provide much more notable performance improvements than any transport + level enhancements. + + The Mowgli system provides full support for adding application level + agent-proxy pairs between the client and the server, the agent on the + mobile device and the proxy on the intermediate node. Such a pair may + be either explicit or fully transparent to the applications, but it + is, at all times, under the end-user control. Good examples of + enhancements achieved with application-specific proxies include + Mowgli WWW [LAKLR95], [LHKR96] and WebExpress [HL96], [CTCSM97]. + + Recommendation: Usage of application level proxies is conditionally + recommended: an application must be proxy enabled and the decision of + employing a proxy for an application must be under the user control + at all times. + +4.10.3 Snoop and its Derivatives + + Berkeley's SNOOP protocol [SNOOP] is a hybrid scheme mixing link- + layer reliability mechanisms with the split connection approach. It + is an improvement over split TCP approaches in that end-to-end + semantics are retained. SNOOP does two things: + + 1. Locally (on the wireless link) retransmit lost packets, instead + of allowing TCP to do so end-to-end. + + 2. Suppress the duplicate acks on their way from the receiver back + to the sender, thus avoiding fast retransmit and congestion + avoidance at the latter. + + Thus, the Snoop protocol is designed to avoid unnecessary fast + retransmits by the TCP sender, when the wireless link layer + retransmits a packet locally. Consider a system that does not use the + Snoop agent. Consider a TCP sender S that sends packets to receiver R + via an intermediate node IN. Assume that the sender sends packet A, + B, C, D, E (in that order) which are forwarded by IN to the wireless + receiver R. 
Assume that the intermediate node then retransmits B + subsequently, because the first transmission of packet B is lost due + to errors on the wireless link. In this case, receiver R receives + packets A, C, D, E and B (in that order). Receipt of packets C, D and + E triggers duplicate acknowledgements. When the TCP sender receives + three duplicate acknowledgements, it triggers fast retransmit (which + results in a retransmission, as well as reduction of congestion + window). The fast retransmit occurs despite the link level + retransmit on the wireless link, degrading throughput. + + + + + +Montenegro, et al. Informational [Page 27] + +RFC 2757 Long Thin Networks January 2000 + + + SNOOP [SNOOP] deals with this problem by dropping TCP dupacks + appropriately (at the intermediate node). The Delayed Dupacks (see + section 4.5) attempts to approximate Snoop without requiring + modifications at the intermediate node. Such schemes are needed only + if the possibility of a fast retransmit due to wireless errors is + non-negligible. In particular, if the wireless link uses a stop-and- + go protocol (or otherwise delivers packets in-order), then these + schemes are not very beneficial. Also, if the bandwidth-delay + product of the wireless link is smaller than four segments, the + probability that the intermediate node will have an opportunity to + send three new packets before a lost packet is retransmitted is + small. Since at least three dupacks are needed to trigger a fast + retransmit, with a wireless bandwidth-delay product less than four + packets, schemes such as Snoop and Delayed Dupacks would not be + necessary (unless the link layer is not designed properly). + Conversely, when the wireless bandwidth-delay product is large + enough, Snoop can provide significant performance improvement + (compared with standard TCP). For further discussion on these topics, + please refer to [Vaidya99]. 
+ + The Delayed Dupacks scheme tends to provide performance benefit in + environments where Snoop performs well. In general, performance + improvement achieved by the Delayed Dupacks scheme is a function of + packet loss rates due to congestion and transmission errors. When + congestion-related losses occur, the Delayed Dupacks scheme + unnecessarily delays retransmission. Thus, in the presence of + congestion losses, the Delayed Dupacks scheme cannot achieve the same + performance improvement as Snoop. However, simulation results + [Vaidya99] indicate that the Delayed Dupacks can achieve a + significant improvement in performance despite moderate congestion + losses. + + WTCP [WTCP] is similar to SNOOP in that it preserves end-to-end + semantics. In WTCP, the intermediate node uses a complex scheme to + hide the time it spends recovering from local errors across the + wireless link (this typically includes retransmissions due to error + recovery, but may also include time spent dealing with congestion). + The idea is for the sender to derive a smooth estimate of round-trip + time. In order to work effectively, it assumes that the TCP + endpoints implement the Timestamps option in RFC 1323 [TCPHP]. + Unfortunately, support for RFC 1323 in TCP implementations is not yet + widespread. Beyond this, WTCP requires changes only at the + intermediate node. + + SNOOP and WTCP require the intermediate node to examine and operate + on the traffic between the portable wireless device and the TCP + server on the wired Internet. SNOOP and WTCP do not work if the IP + traffic is encrypted, unless, of course, the intermediate node shares + + + +Montenegro, et al. Informational [Page 28] + +RFC 2757 Long Thin Networks January 2000 + + + the security association between the mobile device and its end-to-end + peer. They also require that both the data and the corresponding + ACKs traverse the same intermediate node. 
Furthermore, if the + intermediate node retransmits packets at the transport layer across + the wireless link, this may duplicate efforts by the link-layer. + SNOOP has been described by its designers as a TCP-aware link-layer. + This is the right approach: the link and network layers can be much + more aware of each other than traditional OSI layering suggests. + + Encryption of IP packets via IPSEC's ESP header (in either transport + or tunnel mode) renders the TCP header and payload unintelligible to + the intermediate node. This precludes SNOOP (and WTCP) from working, + because it needs to examine the TCP headers in both directions. + Possible solutions involve: + + - making the SNOOP (or WTCP) intermediate node a party to the + security association between the client and the server + + - IPSEC tunneling mode, terminated at the SNOOPing intermediate node + + However, these techniques require that users trust intermediate + nodes. Users valuing both privacy and performance should use SSL or + SOCKS for end-to-end security. These, however, are implemented above + the transport layer, and are not as resistant to some security + attacks (for example, those based on guessing TCP sequence numbers) + as IPSEC. + + Recommendation: Implement SNOOP on intermediate nodes now. Research + results are encouraging, and it is an "invisible" optimization in + that neither the client nor the server needs to change, only the + intermediate node (for basic SNOOP without SACK). However, as + discussed above there is little or no benefit from implementing SNOOP + if: + + 1. The wireless link provides reliable, in-order packet delivery, + or, + + 2. The bandwidth-delay product of the wireless link is smaller + than four segments. + +4.10.4 PEPs to handle Periods of Disconnection + + Periods of disconnection are very common in wireless networks, either + during handoff, due to lack of resources (dropped connections) or + natural obstacles. 
During these periods, a TCP sender does not + receive the expected acknowledgements. Upon expiration of the + retransmit timer, this causes TCP to close its congestion window + with all the related drawbacks. Re-transmitting packets is useless + + + +Montenegro, et al. Informational [Page 29] + +RFC 2757 Long Thin Networks January 2000 + + + since the connection is broken. [M-TCP] aims at enabling TCP to + better handle handoffs and periods of disconnection, while preserving + end-to-end semantics. M-TCP adds an element: supervisor host (SH- + TCP) at the edge of the wireless network. + + This intermediate node monitors the traffic coming from the sender to + the mobile device. It does not break end-to-end semantics because the + ACKs sent from the intermediate node to the sender are effectively + the ones sent by the mobile node. The principle is to generally leave + the last byte unacknowledged. Hence, SH-TCP could shut down the + sender's window by sending the ACK for the last byte with a window + set to zero. Thus the sender will go to persist mode. + + The second optimization is done on both the intermediate node and the + mobile host. On the latter, TCP is aware of the current state of the + connection. In the event of a disconnection, it is capable of + freezing all timers. Upon reconnection, the mobile sends a specially + marked ACK with the number of the highest byte received. The + intermediate node assumes that the mobile is disconnected because it + monitors the flow on the wireless link, so in the absence of + acknowledgments from the mobile, it will inform SH-TCP, which will + send the ACK closing the sender window as described in the previous + paragraph. The intermediate node learns that the mobile is again + connected when it receives a duplicate acknowledgment marked as + reconnected. At this point it sends a duplicate ACK to the sender + and grows the window. The sender exits persist mode and resumes + transmitting at the same rate as before. 
It begins by retransmitting + any data previously unacknowledged by the mobile node. Non + overlapping or non soft handoffs are lightweight because the previous + intermediate system can shrink the window, and the new one modifies + it as soon as it has received an indication from the mobile. + + Recommendation: M-TCP is not slated for adoption at this moment, + because of the highly experimental nature of the proposal, and the + uncertainty that TCP/IP implementations handle zero window updates + correctly. Continue tracking developments in this space. + +4.11 Header Compression Alternatives + + Because Long Thin Networks are bandwidth-constrained, compressing + every byte out of over-the-air segments is worthwhile. + + Mechanisms for TCP and IP header compression defined in [RFC1144, + IPHC, IPHC-RTP, IPHC-PPP] provide the following benefits: + + - Improve interactive response time + + - Allow using small packets for bulk data with good line efficiency + + + +Montenegro, et al. Informational [Page 30] + +RFC 2757 Long Thin Networks January 2000 + + + - Allow using small packets for delay sensitive low data-rate + traffic + + - Decrease header overhead (for a common TCP segment size of 512 + the header overhead of IPv4/TCP within a Mobile IP tunnel can + decrease from 11.7 to less than 1 per cent). + + - Reduce packet loss rate over lossy links (because of the + smaller cross-section of compressed packets). + + Van Jacobson (VJ) header compression [RFC1144] describes a Proposed + Standard for TCP Header compression that is widely deployed. It uses + TCP timeouts to detect a loss of synchronization between the + compressor and decompressor. [IPHC] includes an explicit request for + transmission of uncompressed headers to allow resynchronization + without waiting for a TCP timeout (and executing congestion avoidance + procedures).
+ + Recommendation: Implement [IPHC], in particular as it relates to IP- + in-IP [RFC2003] and Minimal Encapsulation [RFC2004] for Mobile IP, as + well as TCP header compression for lossy links and links that + reorder packets. PPP capable devices should implement [IPHC-PPP]. VJ + header compression may optionally be implemented as it is a widely + deployed Proposed Standard. However, it should only be enabled when + operating over reliable LTNs, because even a single bit error most + probably would result in a full TCP window being dropped, followed by + a costly recovery via slow-start. + +4.12 Payload Compression + + Compression of IP payloads is also desirable. "IP Payload Compression + Protocol (IPComp)" [IPPCP] defines a framework where common + compression algorithms can be applied to arbitrary IP segment + payloads. IP payload compression is something of a niche + optimization. It is necessary because IP-level security converts IP + payloads to random bitstreams, defeating commonly-deployed link-layer + compression mechanisms which are faced with payloads that have no + redundant "information" that can be more compactly represented. + + However, many IP payloads are already compressed (images, audio, + video, "zipped" files being FTPed), or are already encrypted above + the IP layer (SSL/TLS, etc.). These payloads will not "compress" + further, limiting the benefit of this optimization. + + HTTP/1.1 already supports compression of the message body. For + example, to use zlib compression the relevant directives are: + "Content-Encoding: deflate" and "Accept-Encoding: deflate" [HTTP- + PERF]. + + + +Montenegro, et al. Informational [Page 31] + +RFC 2757 Long Thin Networks January 2000 + + + HTTP-NG is considering supporting compression of resources at the + HTTP level, which would provide equivalent benefits for common + compressible MIME types like text/html. This will reduce the need for + IPComp. 
If IPComp is deployed more rapidly than HTTP-NG, IPComp + compression of HTML and MIME headers would be beneficial. + + In general, application-level compression can often outperform + IPComp, because of the opportunity to use compression dictionaries + based on knowledge of the specific data being compressed. + + Recommendation: IPComp may optionally be implemented. Track HTTP-NG + standardization and deployment for now. Implementing HTTP/1.1 + compression using zlib is recommended. + +4.13 TCP Control Block Interdependence [Touch97] + + TCP maintains per-connection information such as connection state, + current round-trip time, congestion control or maximum segment size. + Sharing information between two consecutive connections or when + creating a new connection while the first is still active to the same + host may improve performance of the latter connection. The principle + could easily be extended to sharing information amongst systems in a + LAN not just within a given system. [Touch97] describes cache update + for both cases. + + Users of W-WAN devices frequently request connections to the same + servers or set of servers. For example, in order to read their email + or to initiate connections to other servers, the devices may be + configured to always use the same email server or WWW proxy. The + main advantage of this proposal is that it relieves the application + of the burden of optimizing the transport layer. In order to improve + the performance of TCP connections, this mechanism only requires + changes at the wireless device. + + In general, this scheme should improve the dynamism of connection + setup without increasing the cost of the implementation. + + Recommendation: This mechanism is recommended, although HTTP/1.1 with + its persistent connections may partially achieve the same effect + without it. Other applications (even HTTP/1.0) may find it useful. + Continue monitoring research on this.
In particular, work on a + "Congestion Manager" [CM] may generalize this concept of sharing + information among protocols and applications with a view to making + them more adaptable to network conditions. + + + + + + + +Montenegro, et al. Informational [Page 32] + +RFC 2757 Long Thin Networks January 2000 + + +5 Summary of Recommended Optimizations + + The table below summarizes our recommendations with regard to the + main proposals mentioned above. + + The first column, "Stability of the Proposal," refers to the maturity + of the mechanism in question. Some proposals are being pursued + within the IETF in a somewhat open fashion. An IETF proposal is + either an Internet Draft (I-D) or a Request for Comments (RFC). The + former is a preliminary version. There are several types of RFCs. A + Draft Standard (DS) is standards track, and carries more weight than + a Proposed Standard (PS), which may still undergo revisions. + Informational or Experimental RFCs do not specify a standard. Other + proposals are isolated efforts with little or no public review, and + unknown chances of garnering industry backing. + + "Implemented at" indicates which participant in a TCP session must be + modified to implement the proposal. Legacy servers typically cannot + be modified, so this column indicates whether implementation happens + at either or both of the two nodes under some control: mobile device + and intermediate node. The symbols used are: WS (wireless sender, + that is, the mobile device's TCP send operation must be modified), WR + (wireless receiver, that is, the mobile device's TCP receive + operation must be modified), WD (wireless device, that is, + modifications at the mobile device are not specific to either TCP + send or receive), IN (intermediate node) and NI (network + infrastructure). These entities are to be understood within the + context of Section 1.1 ("Network Architecture"). NA simply means "not + applicable."
+ + The "Recommendation" column captures our suggestions. Some + mechanisms are endorsed for immediate adoption, others need more + evidence and research, and others are not recommended. + +Name Stability of Implemented Recommendation + the Proposal at +==================== ============= =========== ================= + +Increased Initial RFC 2581 (PS) WS Yes +Window (initial_window=2) + +Disable delayed ACKs NA WR When stable +during slow start + +Byte counting NA WS No +instead of ACK +counting + + + + +Montenegro, et al. Informational [Page 33] + +RFC 2757 Long Thin Networks January 2000 + + +TCP Header RFC 1144 (PS) WD Yes +compression for PPP IN (see 4.11) + +IP Payload RFC 2393 (PS) WD Yes +Compression (simultaneously +(IPComp) needed on Server) + +Header RFC 2507 (PS), WD Yes +Compression RFC 2509 (PS) IN (For IPv4, TCP and + Mobile IP, PPP) + +SNOOP plus SACK In limited use IN Yes + WD (for SACK) + +Fast retransmit/fast RFC 2581 (PS) WD Yes (should be +recovery there already) + +Transaction/TCP RFC 1644 WD No + (Experimental) (simultaneously + needed on Server) + +Estimating Slow NA WS No +Start Threshold +(ssthresh) + +Delayed Duplicate Not stable WR When stable +Acknowledgements IN (for + notifications) + +Class-based Queuing NA WD When stable +on End Systems + +Explicit Congestion RFC 2481 (EXP) WD Yes + +Notification NI + +TCP Control Block RFC 2140 WD Yes +Interdependence (Informational) (Track research) + + + Of all the optimizations in the table above, only SNOOP plus SACK and + Delayed duplicate acknowledgements are currently being proposed only + for wireless networks. The others are being considered even for non- + wireless applications. Their more general applicability attracts more + attention and analysis from the research community. + + Of the above mechanisms, only Header Compression (for IP and TCP) and + "SNOOP plus SACK" cease to work in the presence of IPSec. + + + +Montenegro, et al. 
Informational [Page 34] + +RFC 2757 Long Thin Networks January 2000 + + +6 Conclusion + + In view of the unpredictable and problematic nature of long thin + networks, arriving at an optimized transport is a daunting task. We + have reviewed the existing proposals along with future research + items. Based on this overview, we also recommend mechanisms for + implementation in long thin networks (LTNs). + +7 Acknowledgements + + The authors are deeply indebted to the IETF tcpsat and tcpimpl + working groups. The following individuals have also provided valuable + feedback: Mark Allman (NASA), Vern Paxson (ACIRI), Raphi Rom + (Technion/Sun), Charlie Perkins (Nokia), Peter Stark (Phone.com). + +8 Security Considerations + + The mechanisms discussed and recommended in this document have been + proposed in previous publications. The security considerations + outlined in the original discussions apply here as well. Several + security issues are also discussed throughout this document. + Additionally, we present below a non-exhaustive list of the most + salient issues concerning our recommended mechanisms: + + - Larger Initial TCP Window Size + + No known security issues [RFC2414, RFC2581]. + + - Header Compression + + May be open to some denial of service attacks. But any attacker in + a position to launch these attacks would have much stronger + attacks at his disposal [IPHC, IPHC-RTP]. + + - Congestion Control, Fast Retransmit/Fast Recovery + + An attacker may force TCP connections to grind to a halt, or, more + dangerously, behave more aggressively. The latter possibility may + lead to congestion collapse, at least in some regions of the + network [RFC2581]. + + - Explicit Congestion Notification + + It does not appear to increase the vulnerabilities in the network. + On the contrary, it may reduce them by aiding in the + identification of flows unresponsive to or non-compliant with TCP + congestion control [ECN]. + + + + +Montenegro, et al. 
Informational [Page 35] + +RFC 2757 Long Thin Networks January 2000 + + + - Sharing of Network Performance Information (TCP Control Block + Sharing and Congestion Manager module) + + Some information should not be shared. For example, TCP sequence + numbers are used to protect against spoofing attacks. Even + limiting the sharing to performance values leaves open the + possibility of denial-of-service attacks [Touch97]. + + - Performance Enhancing Proxies + + These systems are men-in-the-middle from the point of view of + their security vulnerabilities. Accordingly, they must be used + with extreme care so as to prevent their being hijacked and + misused. + + This last point is not to be underestimated: there is a general + security concern whenever an intermediate node performs operations + different from those carried out in an end-to-end basis. This is not + specific to performance-enhancing proxies. In particular, there may + be a tendency to forego IPSEC-based privacy in order to allow, for + example, a SNOOP module, header compression (TCP, UDP, RTP, etc), or + HTTP proxies to work. + + Adding end-to-end security at higher layers (for example via RTP + encryption, or via TLS encryption of the TCP payload) alleviates the + problem. However, this still leaves protocol headers in the clear, + and these may be exploited for traffic analysis and denial-of-service + attacks. + +9 References + + [ACKSPACING] Partridge, C., "ACK Spacing for High Delay-Bandwidth + Paths with Insufficient Buffering", Work in Progress. + + [ADGGHOSSTT98] Allman, M., Dawkins, S., Glover, D., Griner, J., + Henderson, T., Heidemann, J., Kruse, H., Osterman, S., + Scott, K., Semke, J., Touch, J. and D. Tran, "Ongoing + TCP Research Related to Satellites", Work in Progress. + + [AGS98] Allman, M., Glover, D. and L. Sanchez, "Enhancing TCP + Over Satellite Channels using Standard Mechanisms", + BCP 28, RFC 2488, January 1999. + + + + + + + + + +Montenegro, et al. 
Informational [Page 36] + +RFC 2757 Long Thin Networks January 2000 + + + [Allman98] Mark Allman. On the Generation and Use of TCP + Acknowledgments. ACM Computer Communication Review, + 28(5), October 1998. + + [AHO98] Allman, M., Hayes, C., Ostermann, S., "An Evaluation + of TCP with Larger Initial Windows," Computer + Communication Review, 28(3), July 1998. + + [BBKT96] Bhagwat, P., Bhattacharya, P., Krishna, A., Tripathi, + S., "Enhancing Throughput over Wireless LANs Using + Channel State Dependent Packet Scheduling," in Proc. + IEEE INFOCOM'96, pp. 1133-40, March 1996. + + [BBKVP96] Bakshi, B., P., Krishna, N., Vaidya, N., Pradhan, + D.K., "Improving Performance of TCP over Wireless + Networks," Technical Report 96-014, Texas A&M + University, 1996. + + [BPSK96] Balakrishnan, H., Padmanabhan, V., Seshan, S., Katz, + R., "A Comparison of Mechanisms for Improving TCP + Performance over Wireless Links," in ACM SIGCOMM, + Stanford, California, August 1996. + + [BPK99] Balakrishnan, H., Padmanabhan, V., Katz, R., "The + effects of asymmetry on TCP performance," ACM Mobile + Networks and Applications (MONET), Vol. 4, No. 3, + 1999, pp. 219-241. + + [BV97] S. Biaz and N. H. Vaidya, "Distinguishing Congestion + Losses from Wireless Transmission Losses: A Negative + Result," Seventh International Conference on Computer + Communications and Networks (IC3N), New Orleans, + October 1998. + + [BV98] Biaz, S., Vaidya, N., "Sender-Based heuristics for + Distinguishing Congestion Losses from Wireless + Transmission Losses," Texas A&M University, Technical + Report 98-013, June 1998. + + [BV98a] Biaz, S., Vaidya, N., "Discriminating Congestion + Losses from Wireless Losses using Inter-Arrival Times + at the Receiver," Texas A&M University, Technical + Report 98-014, June 1998. + + [BW97] Brasche, G., Walke, B., "Concepts, Services, and + Protocols of the New GSM Phase 2+ general Packet Radio + Service," IEEE Communications Magazine, Vol. 35, No. + 8, August 1997. 
+ + + +Montenegro, et al. Informational [Page 37] + +RFC 2757 Long Thin Networks January 2000 + + + [CB96] Cheshire, S., Baker, M., "Experiences with a Wireless + Network in MosquitoNet," IEEE Micro, February 1996. + Available online as: + http://rescomp.stanford.edu/~cheshire/papers + /wireless.ps. + + [CDMA] Electronic Industry Alliance(EIA)/Telecommunications + Industry Association (TIA), IS-95: Mobile Station-Base + Station Compatibility Standard for Dual-Mode Wideband + Spread Spectrum Cellular System, 1993. + + [CDPD] Wireless Data Forum, CDPD System Specification, + Release 1.1, 1995. + + [CM] Hari Balakrishnan and Srinivasan Seshan, "The + Congestion Manager," Work in Progress. + + [CTCSM97] Chang, H., Tait, C., Cohen, N., Shapiro, M., + Mastrianni, S., Floyd, R., Housel, B., Lindquist, D., + "Web Browsing in a Wireless Environment: Disconnected + and Asynchronous Operation in ARTour Web Express," in + Proc. MobiCom'97, Budapest, Hungary, September 1997. + + [Demers90] Demers, A., Keshav, S., and Shenker, S., Analysis and + Simulation of a Fair Queueing Algorithm, + Internetworking: Research and Experience, Vol. 1, + 1990, pp. 3-26. + + [ECN] Ramakrishnan, K. and S. Floyd, "A Proposal to add + Explicit Congestion Notification (ECN) to IP", RFC + 2481, January 1999. + + [Floyd95] Floyd, S., and Jacobson, V., Link-sharing and Resource + Management Models for Packet Networks. IEEE/ACM + Transactions on Networking, Vol. 3 No. 4, pp. 365-386, + August 1995. + + [FSS98] Fragouli, C., Sivaraman, V., Srivastava, M., + "Controlled Multimedia Wireless Link Sharing via + Enhanced Class-Based Queueing with Channel-State- + Dependent Packet Scheduling," Proc. IEEE INFOCOM'98, + April 1998. + + [GPRS] ETSI, "General Packet Radio Service (GPRS): Service + Description, Stage 2," GSM03.60, v.6.1.1 August 1998. + + + + + + +Montenegro, et al. 
Informational [Page 38] + +RFC 2757 Long Thin Networks January 2000 + + + [GSM] Rahnema, M., "Overview of the GSM system and protocol + architecture," IEEE Communications Magazine, vol. 31, + pp 92-100, April 1993. + + [HL96] Hausel, B., Lindquist, D., "WebExpress: A System for + Optimizing Web Browsing in a Wireless Environment," in + Proc. MobiCom'96, Rye, New York, USA, November 1996. + + [HTTP-PERF] Henrik Frystyk Nielsen (W3C, MIT), Jim Gettys (W3C, + Digital), Anselm Baird-Smith (W3C, INRIA), Eric + Prud'hommeaux (W3C, MIT), Hon Lie (W3C, INRIA), Chris + Lilley (W3C, INRIA), "Network Performance Effects of + HTTP/1.1, CSS1, and PNG," ACM SIGCOMM '97, Cannes, + France, September 1997. Available at: + http://www.w3.org/Protocols/HTTP/Performance + /Pipeline.html + + [IPPCP] Shacham, A., Monsour, R., Pereira, R. and M. Thomas, + "IP Payload Compression Protocol (IPComp)", RFC 2393, + December 1998. + + [IPHC] Degermark, M., Nordgren, B. and S. Pink, "IP Header + Compression", RFC 2507, February 1999. + + [IPHC-RTP] Casner, S. and V. Jacobson, "Compressing IP/UDP/RTP + Headers for Low-Speed Serial Links", RFC 2508, + February 1999. + + [IPHC-PPP] Engan, M., Casner, S. and C. Bormann, "IP Header + Compression over PPP", RFC 2509, February 1999. + + [ITCP] Bakre, A., Badrinath, B.R., "Handoff and Systems + Support for Indirect TCP/IP. In Proceedings of the + Second USENIX Symposium on Mobile and Location- + Independent Computing, Ann Arbor, Michigan, April 10- + 11, 1995. + + [Jain89] Jain, R., "A Delay-Based Approach for Congestion + Avoidance in Interconnected Heterogeneous Computer + Networks," Digital Equipment Corporation, Technical + Report DEC-TR-566, April 1989. + + [Karn93] Karn, P., "The Qualcomm CDMA Digital Cellular System" + Proc. USENIX Mobile and Location-Independent Computing + Symposium, USENIX Association, August 1993. + + + + + + +Montenegro, et al. 
Informational [Page 39] + +RFC 2757 Long Thin Networks January 2000 + + + [KRLKA97] Kojo, M., Raatikainen, K., Liljeberg, M., Kiiskinen, + J., Alanko, T., "An Efficient Transport Service for + Slow Wireless Telephone Links," in IEEE Journal on + Selected Areas of Communication, volume 15, number 7, + September 1997. + + [LAKLR95] Liljeberg, M., Alanko, T., Kojo, M., Laamanen, H., + Raatikainen, K., "Optimizing World-Wide Web for + Weakly-Connected Mobile Workstations: An Indirect + Approach," in Proc. 2nd Int. Workshop on Services in + Distributed and Networked Environments, Whistler, + Canada, pp. 132-139, June 1995. + + [LHKR96] Liljeberg, M., Helin, H., Kojo, M., Raatikainen, K., + "Mowgli WWW Software: Improved Usability of WWW in + Mobile WAN Environments," in Proc. IEEE Global + Internet 1996 Conference, London, UK, November 1996. + + [LS98] Lettieri, P., Srivastava, M., "Adaptive Frame Length + Control for Improving Wireless Link Throughput, Range, + and Energy Efficiency," Proc. IEEE INFOCOM'98, April + 1998. + + [MNCP] Piscitello, D., Phifer, L., Wang, Y., Hovey, R., + "Mobile Network Computing Protocol (MNCP)", Work in + Progress. + + [MOWGLI] Kojo, M., Raatikainen, K., Alanko, T., "Connecting + Mobile Workstations to the Internet over a Digital + Cellular Telephone Network," in Proc. Workshop on + Mobile and Wireless Information Systems (MOBIDATA), + Rutgers University, NJ, November 1994. Available at: + http://www.cs.Helsinki.FI/research/mowgli/. Revised + version published in Mobile Computing, pp. 253-270, + Kluwer, 1996. + + [MSMO97] Mathis, M., Semke, J., Mahdavi, J., Ott, T., "The + Macroscopic Behavior of the TCP Congestion Avoidance + Algorithm," in Computer Communications Review, a + publication of ACM SIGCOMM, volume 27, number 3, July + 1997. + + [MTCP] Brown, K. Singh, S., "A Network Architecture for + Mobile Computing," Proc. IEEE INFOCOM'96, pp. 1388- + 1396, March 1996. 
Available at + ftp://ftp.ece.orst.edu/pub/singh/papers + /transport.ps.gz + + + + +Montenegro, et al. Informational [Page 40] + +RFC 2757 Long Thin Networks January 2000 + + + [M-TCP] Brown, K. Singh, S., "M-TCP: TCP for Mobile Cellular + Networks," ACM Computer Communications Review Vol. + 27(5), 1997. Available at + ftp://ftp.ece.orst.edu/pub/singh/papers/mtcp.ps.gz + + [MV97] Mehta, M., Vaidya, N., "Delayed Duplicate- + Acknowledgements: A Proposal to Improve Performance + of TCP on Wireless Links," Texas A&M University, + December 24, 1997. Available at + http://www.cs.tamu.edu/faculty/vaidya/mobile.html + + [NETBLT] White, J., "NETBLT (Network Block Transfer Protocol)", + Work in Progress. + + [Paxson97] V. Paxson, "End-to-End Internet Packet Dynamics," + Proc. SIGCOMM '97. Available at + ftp://ftp.ee.lbl.gov/papers/vp-pkt-dyn-sigcomm97.ps.Z + + [RED] Braden, B., Clark, D., Crowcroft, J., Davie, B., + Deering, S., Estrin, D., Floyd, S., Jacobson, V., + Minshall, G., Partridge, C., Peterson, L., + Ramakrishnan, K., Shenker, S., Wroclawski, J. and L. + Zhang, "Recommendations on Queue Management and + Congestion Avoidance in the Internet", RFC 2309, April + 1998. + + [RLP] ETSI, "Radio Link Protocol for Data and Telematic + Services on the Mobile Station - Base Station System + (MS-BSS) interface and the Base Station System - + Mobile Switching Center (BSS-MSC) interface," GSM + Specification 04.22, Version 3.7.0, February 1992. + + [RFC908] Velten, D., Hinden, R. and J. Sax, "Reliable Data + Protocol", RFC 908, July 1984. + + [RFC1030] Lambert, M., "On Testing the NETBLT Protocol over + Divers Networks", RFC 1030, November 1987. + + [RFC1122] Braden, R., "Requirements for Internet Hosts -- + Communication Layers", STD 3, RFC 1122, October 1989. + + [RFC1144] Jacobson, V., "Compressing TCP/IP Headers for Low- + Speed Serial Links", RFC 1144, February 1990. 
+ + [RFC1151] Partridge, C., Hinden, R., "Version 2 of the Reliable + Data Protocol (RDP)", RFC 1151, April 1990. + + + + + +Montenegro, et al. Informational [Page 41] + +RFC 2757 Long Thin Networks January 2000 + + + [RFC1191] Mogul, J. and S. Deering, "Path MTU Discovery", RFC + 1191, November 1990. + + [RFC1397] Braden, R., "Extending TCP for Transactions -- + Concepts", RFC 1397, November 1992. + + [RFC1644] Braden, R., "T/TCP -- TCP Extensions for Transactions + Functional Specification", RFC 1644, July 1994. + + [RFC1661] Simpson, W., "The Point-To-Point Protocol (PPP)", STD + 51, RFC 1661, July 1994. + + [RFC1928] Leech, M., Ganis, M., Lee, Y., Kuris, R., Koblas, D. + and L. Jones, "SOCKS Protocol Version 5", RFC 1928, + March 1996. + + [RFC1986] Polites, W., Wollman, W., Woo, D. and R. Langan, + "Experiments with a Simple File Transfer Protocol for + Radio Links using Enhanced Trivial File Transfer + Protocol (ETFTP)", RFC 1986, August 1996. + + [RFC2002] Perkins, C., "IP Mobility Support", RFC 2002, October + 1996. + + [RFC2003] Perkins, C., "IP Encapsulation within IP", RFC 2003, + October 1996. + + [RFC2004] Perkins, C., "Minimal Encapsulation within IP", RFC + 2004, October 1996. + + [RFC2018] Mathis, M., Mahdavi, J., Floyd, S. and A. Romanow, + "TCP Selective Acknowledgment Options", RFC 2018, + October 1996. + + [RFC2188] Banan, M., Taylor, M. and J. Cheng, "AT&T/Neda's + Efficient Short Remote Operations (ESRO) Protocol + Specification Version 1.2", RFC 2188, September 1997. + + [RFC2246] Dierk, T. and E. Allen, "TLS Protocol Version 1", RFC + 2246, January 1999. + + [RFC2414] Allman, M., Floyd, S. and C. Partridge. "Increasing + TCP's Initial Window", RFC 2414, September 1998. + + [RFC2415] Poduri, K.and K. Nichols, "Simulation Studies of + Increased Initial TCP Window Size", RFC 2415, + September 1998. + + + + +Montenegro, et al. Informational [Page 42] + +RFC 2757 Long Thin Networks January 2000 + + + [RFC2416] Shepard, T. and C. 
Partridge, "When TCP Starts Up With + Four Packets Into Only Three Buffers", RFC 2416, + September 1998. + + [RFC2581] Allman, M., Paxson, V. and W. Stevens, "TCP Congestion + Control", RFC 2581, April 1999. + + [RFC2582] Floyd, S. and T. Henderson, "The NewReno Modification + to TCP's Fast Recovery Algorithm", RFC 2582, April + 1999. + + [SNOOP] Balakrishnan, H., Seshan, S., Amir, E., Katz, R., + "Improving TCP/IP Performance over Wireless Networks," + Proc. 1st ACM Conf. on Mobile Computing and Networking + (Mobicom), Berkeley, CA, November 1995. + + [Stevens94] R. Stevens, "TCP/IP Illustrated, Volume 1," Addison- + Wesley, 1994 (section 2.10 for MTU size considerations + and section 11.3 for weak checksums). + + [TCPHP] Jacobson, V., Braden, R. and D. Borman, "TCP + Extensions for High Performance", RFC 1323, May 1992. + + [TCPSATMIN] TCPSAT Minutes, August, 1997. Available at: + http://tcpsat.lerc.nasa.gov/tcpsat/meetings/munich- + minutes.txt. + + [Touch97] Touch, T., "TCP Control Block Interdependence", RFC + 2140, April 1997. + + [Vaidya99] N. H. Vaidya, M. Mehta, C. Perkins, G. Montenegro, + "Delayed Duplicate Acknowledgements: A TCP-Unaware + Approach to Improve Performance of TCP over Wireless," + Technical Report 99-003, Computer Science Dept., Texas + A&M University, February 1999. + + [VEGAS] Brakmo, L., O'Malley, S., "TCP Vegas, New Techniques + for Congestion Detection and Avoidance," SIGCOMM'94, + London, pp 24-35, October 1994. + + [VMTP] Cheriton, D., "VMTP: Versatile Message Transaction + Protocol", RFC 1045, February 1988. + + [WAP] Wireless Application Protocol Forum. + http://www.wapforum.org/ + + + + + + +Montenegro, et al. Informational [Page 43] + +RFC 2757 Long Thin Networks January 2000 + + + [WC91] Wang, Z., Crowcroft, J., "A New Congestion Control + Scheme: Slow Start and Search," ACM Computer + Communication Review, vol 21, pp 32-43, January 1991. 
+ + [WTCP] Ratnam, K., Matta, I., "WTCP: An Efficient + Transmission Control Protocol for Networks with + Wireless Links," Technical Report NU-CCS-97-11, + Northeastern University, July 1997. Available at: + http://www.ece.neu.edu/personal/karu/papers/WTCP- + NU.ps.gz + + [YB94] Yavatkar, R., Bhagawat, N., "Improving End-to-End + Performance of TCP over Mobile Internetworks," Proc. + Workshop on Mobile Computing Systems and Applications, + IEEE Computer Society Press, Los Alamitos, California, + 1994. + +Authors' Addresses + + Questions about this document may be directed at: + + Gabriel E. Montenegro + Sun Labs Networking and Security Group + Sun Microsystems, Inc. + 901 San Antonio Road + Mailstop UMPK 15-214 + Mountain View, California 94303 + + Phone: +1-650-786-6288 + Fax: +1-650-786-6445 + EMail: gab@sun.com + + + Spencer Dawkins + Nortel Networks + P.O. Box 833805 + Richardson, Texas 75083-3805 + + Phone: +1-972-684-4827 + Fax: +1-972-685-3292 + EMail: sdawkins@nortel.com + + + + + + + + + + +Montenegro, et al. Informational [Page 44] + +RFC 2757 Long Thin Networks January 2000 + + + Markku Kojo + Department of Computer Science + University of Helsinki + P.O. Box 26 (Teollisuuskatu 23) + FIN-00014 HELSINKI + Finland + + Phone: +358-9-1914-4179 + Fax: +358-9-1914-4441 + EMail: kojo@cs.helsinki.fi + + + Vincent Magret + Corporate Research Center + Alcatel Network Systems, Inc + 1201 Campbell + Mail stop 446-310 + Richardson Texas 75081 USA + M/S 446-310 + + Phone: +1-972-996-2625 + Fax: +1-972-996-5902 + EMail: vincent.magret@aud.alcatel.com + + + Nitin Vaidya + Dept. of Computer Science + Texas A&M University + College Station, TX 77843-3112 + + Phone: 979-845-0512 + Fax: 979-847-8578 + EMail: vaidya@cs.tamu.edu + + + + + + + + + + + + + + + + + + +Montenegro, et al. Informational [Page 45] + +RFC 2757 Long Thin Networks January 2000 + + +Full Copyright Statement + + Copyright (C) The Internet Society (2000). All Rights Reserved. 
+ + This document and translations of it may be copied and furnished to + others, and derivative works that comment on or otherwise explain it + or assist in its implementation may be prepared, copied, published + and distributed, in whole or in part, without restriction of any + kind, provided that the above copyright notice and this paragraph are + included on all such copies and derivative works. However, this + document itself may not be modified in any way, such as by removing + the copyright notice or references to the Internet Society or other + Internet organizations, except as needed for the purpose of + developing Internet standards in which case the procedures for + copyrights defined in the Internet Standards process must be + followed, or as required to translate it into languages other than + English. + + The limited permissions granted above are perpetual and will not be + revoked by the Internet Society or its successors or assigns. + + This document and the information contained herein is provided on an + "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING + TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING + BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION + HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF + MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + +Acknowledgement + + Funding for the RFC Editor function is currently provided by the + Internet Society. + + + + + + + + + + + + + + + + + + + +Montenegro, et al. Informational [Page 46] + diff --git a/ext/picotcp/RFC/rfc2760.txt b/ext/picotcp/RFC/rfc2760.txt new file mode 100644 index 0000000..5779d8d --- /dev/null +++ b/ext/picotcp/RFC/rfc2760.txt @@ -0,0 +1,2579 @@ + + + + + + +Network Working Group M. Allman, Editor +Request for Comments: 2760 NASA Glenn Research Center/BBN Technologies +Category: Informational S. Dawkins + Nortel + D. Glover + J. Griner + D. Tran + NASA Glenn Research Center + T. 
Henderson + University of California at Berkeley + J. Heidemann + J. Touch + University of Southern California/ISI + H. Kruse + S. Ostermann + Ohio University + K. Scott + The MITRE Corporation + J. Semke + Pittsburgh Supercomputing Center + February 2000 + + + Ongoing TCP Research Related to Satellites + + +Status of this Memo + + This memo provides information for the Internet community. It does + not specify an Internet standard of any kind. Distribution of this + memo is unlimited. + +Copyright Notice + + Copyright (C) The Internet Society (2000). All Rights Reserved. + +Abstract + + This document outlines possible TCP enhancements that may allow TCP + to better utilize the available bandwidth provided by networks + containing satellite links. The algorithms and mechanisms outlined + have not been judged to be mature enough to be recommended by the + IETF. The goal of this document is to educate researchers as to the + current work and progress being done in TCP research related to + satellite networks. + + + + + + +Allman, et al. Informational [Page 1] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + +Table of Contents + + 1 Introduction. . . . . . . . . . . . . . . . . . . . 2 + 2 Satellite Architectures . . . . . . . . . . . . . . 3 + 2.1 Asymmetric Satellite Networks . . . . . . . . . . . 3 + 2.2 Satellite Link as Last Hop. . . . . . . . . . . . . 3 + 2.3 Hybrid Satellite Networks . . . . . . . . . . . 4 + 2.4 Point-to-Point Satellite Networks . . . . . . . . . 4 + 2.5 Multiple Satellite Hops . . . . . . . . . . . . . . 4 + 3 Mitigations . . . . . . . . . . . . . . . . . . . . 4 + 3.1 TCP For Transactions. . . . . . . . . . . . . . . . 4 + 3.2 Slow Start. . . . . . . . . . . . . . . . . . . . . 5 + 3.2.1 Larger Initial Window . . . . . . . . . . . . . . . 6 + 3.2.2 Byte Counting . . . . . . . . . . . . . . . . . . . 7 + 3.2.3 Delayed ACKs After Slow Start . . . . . . . . . . . 9 + 3.2.4 Terminating Slow Start. . . . . . . . . . . . . 
. . 11 + 3.3 Loss Recovery . . . . . . . . . . . . . . . . . . . 12 + 3.3.1 Non-SACK Based Mechanisms . . . . . . . . . . . . . 12 + 3.3.2 SACK Based Mechanisms . . . . . . . . . . . . . . . 13 + 3.3.3 Explicit Congestion Notification. . . . . . . . . . 16 + 3.3.4 Detecting Corruption Loss . . . . . . . . . . . . . 18 + 3.4 Congestion Avoidance. . . . . . . . . . . . . . . . 21 + 3.5 Multiple Data Connections . . . . . . . . . . . . . 22 + 3.6 Pacing TCP Segments . . . . . . . . . . . . . . . . 24 + 3.7 TCP Header Compression. . . . . . . . . . . . . . . 26 + 3.8 Sharing TCP State Among Similar Connections . . . . 29 + 3.9 ACK Congestion Control. . . . . . . . . . . . . . . 32 + 3.10 ACK Filtering . . . . . . . . . . . . . . . . . . . 34 + 4 Conclusions . . . . . . . . . . . . . . . . . . . . 36 + 5 Security Considerations . . . . . . . . . . . . . . 36 + 6 Acknowledgments . . . . . . . . . . . . . . . . . . 37 + 7 References. . . . . . . . . . . . . . . . . . . . . 37 + 8 Authors' Addresses. . . . . . . . . . . . . . . . . 43 + 9 Full Copyright Statement. . . . . . . . . . . . . . 46 + +1 Introduction + + This document outlines mechanisms that may help the Transmission + Control Protocol (TCP) [Pos81] better utilize the bandwidth provided + by long-delay satellite environments. These mechanisms may also help + in other environments or for other protocols. The proposals outlined + in this document are currently being studied throughout the research + community. Therefore, these mechanisms are not mature enough to be + recommended for wide-spread use by the IETF. However, some of these + mechanisms may be safely used today. It is hoped that this document + will stimulate further study into the described mechanisms. If, at + + + + + +Allman, et al. 
Informational [Page 2] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + + some point, the mechanisms discussed in this memo prove to be safe + and appropriate to be recommended for general use, the appropriate + IETF documents will be written. + + It should be noted that non-TCP mechanisms that help performance over + satellite links do exist (e.g., application-level changes, queueing + disciplines, etc.). However, outlining these non-TCP mitigations is + beyond the scope of this document and therefore is left as future + work. Additionally, there are a number of mitigations to TCP's + performance problems that involve very active intervention by + gateways along the end-to-end path from the sender to the receiver. + Documenting the pros and cons of such solutions is also left as + future work. + +2 Satellite Architectures + + Specific characteristics of satellite links and the impact these + characteristics have on TCP are presented in RFC 2488 [AGS99]. This + section discusses several possible topologies where satellite links + may be integrated into the global Internet. The mitigation outlined + in section 3 will include a discussion of which environment the + mechanism is expected to benefit. + +2.1 Asymmetric Satellite Networks + + Some satellite networks exhibit a bandwidth asymmetry, a larger data + rate in one direction than the reverse direction, because of limits + on the transmission power and the antenna size at one end of the + link. Meanwhile, some other satellite systems are unidirectional and + use a non-satellite return path (such as a dialup modem link). The + nature of most TCP traffic is asymmetric with data flowing in one + direction and acknowledgments in the opposite direction. However, the + term asymmetric in this document refers to different physical + capacities in the forward and return links. Asymmetry has been shown + to be a problem for TCP [BPK97,BPK98].
+ +2.2 Satellite Link as Last Hop + + Satellite links that provide service directly to end users, as + opposed to satellite links located in the middle of a network, may + allow for specialized design of protocols used over the last hop. + Some satellite providers use the satellite link as a shared high + speed downlink to users with a lower speed, non-shared terrestrial + link that is used as a return link for requests and acknowledgments. + Many times this creates an asymmetric network, as discussed above. + + + + + + +Allman, et al. Informational [Page 3] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + +2.3 Hybrid Satellite Networks + + In the more general case, satellite links may be located at any point + in the network topology. In this case, the satellite link acts as + just another link between two gateways. In this environment, a given + connection may be sent over terrestrial links (including terrestrial + wireless), as well as satellite links. On the other hand, a + connection could also travel over only the terrestrial network or + only over the satellite portion of the network. + +2.4 Point-to-Point Satellite Networks + + In point-to-point satellite networks, the only hop in the network is + over the satellite link. This pure satellite environment exhibits + only the problems associated with the satellite links, as outlined in + [AGS99]. Since this is a private network, some mitigations that are + not appropriate for shared networks can be considered. + +2.5 Multiple Satellite Hops + + In some situations, network traffic may traverse multiple satellite + hops between the source and the destination. Such an environment + aggravates the satellite characteristics described in [AGS99]. + +3 Mitigations + + The following sections will discuss various techniques for mitigating + the problems TCP faces in the satellite environment. 
Each of the + following sections will be organized as follows: First, each + mitigation will be briefly outlined. Next, research work involving + the mechanism in question will be briefly discussed. Next the + implementation issues of the mechanism will be presented (including + whether or not the particular mechanism presents any dangers to + shared networks). Then a discussion of the mechanism's potential + with regard to the topologies outlined above is given. Finally, the + relationships and possible interactions with other TCP mechanisms are + outlined. The reader is expected to be familiar with the TCP + terminology used in [AGS99]. + +3.1 TCP For Transactions + +3.1.1 Mitigation Description + + TCP uses a three-way handshake to set up a connection between two + hosts [Pos81]. This connection setup requires 1-1.5 round-trip times + (RTTs), depending upon whether the data sender started the connection + actively or passively. This startup time can be eliminated by using + TCP extensions for transactions (T/TCP) [Bra94]. After the first + + + +Allman, et al. Informational [Page 4] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + + connection between a pair of hosts is established, T/TCP is able to + bypass the three-way handshake, allowing the data sender to begin + transmitting data in the first segment sent (along with the SYN). + This is especially helpful for short request/response traffic, as it + saves a potentially long setup phase when no useful data is being + transmitted. + +3.1.2 Research + + T/TCP is outlined and analyzed in [Bra92,Bra94]. + +3.1.3 Implementation Issues + + T/TCP requires changes in the TCP stacks of both the data sender and + the data receiver. While T/TCP is safe to implement in shared + networks from a congestion control perspective, several security + implications of sending data in the first data segment have been + identified [ddKI99].
+ +3.1.4 Topology Considerations + + It is expected that T/TCP will be equally beneficial in all + environments outlined in section 2. + +3.1.5 Possible Interaction and Relationships with Other Research + + T/TCP allows data transfer to start more rapidly, much like using a + larger initial congestion window (see section 3.2.1), delayed ACKs + after slow start (section 3.2.3) or byte counting (section 3.2.2). + +3.2 Slow Start + + The slow start algorithm is used to gradually increase the size of + TCP's congestion window (cwnd) [Jac88,Ste97,APS99]. The algorithm is + an important safe-guard against transmitting an inappropriate amount + of data into the network when the connection starts up. However, + slow start can also waste available network capacity, especially in + long-delay networks [All97a,Hay97]. Slow start is particularly + inefficient for transfers that are short compared to the + delay*bandwidth product of the network (e.g., WWW transfers). + + Delayed ACKs are another source of wasted capacity during the slow + start phase. RFC 1122 [Bra89] suggests data receivers refrain from + ACKing every incoming data segment. However, every second full-sized + segment should be ACKed. If a second full-sized segment does not + arrive within a given timeout, an ACK must be generated (this timeout + cannot exceed 500 ms). Since the data sender increases the size of + cwnd based on the number of arriving ACKs, reducing the number of + + + +Allman, et al. Informational [Page 5] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + + ACKs slows the cwnd growth rate. In addition, when TCP starts + sending, it sends 1 segment. When using delayed ACKs a second + segment must arrive before an ACK is sent. Therefore, the receiver + is always forced to wait for the delayed ACK timer to expire before + ACKing the first segment, which also increases the transfer time. + + Several proposals have suggested ways to make slow start less time + consuming. 
These proposals are briefly outlined below and references + to the research work given. + +3.2.1 Larger Initial Window + +3.2.1.1 Mitigation Description + + One method that will reduce the amount of time required by slow start + (and therefore, the amount of wasted capacity) is to increase the + initial value of cwnd. An experimental TCP extension outlined in + [AFP98] allows the initial size of cwnd to be increased from 1 + segment to that given in equation (1). + + min (4*MSS, max (2*MSS, 4380 bytes)) (1) + + By increasing the initial value of cwnd, more packets are sent during + the first RTT of data transmission, which will trigger more ACKs, + allowing the congestion window to open more rapidly. In addition, by + sending at least 2 segments initially, the first segment does not + need to wait for the delayed ACK timer to expire as is the case when + the initial size of cwnd is 1 segment (as discussed above). + Therefore, the value of cwnd given in equation 1 saves up to 3 RTTs + and a delayed ACK timeout when compared to an initial cwnd of 1 + segment. + + Also, we note that RFC 2581 [APS99], a standards-track document, + allows a TCP to use an initial cwnd of up to 2 segments. This change + is highly recommended for satellite networks. + +3.2.1.2 Research + + Several researchers have studied the use of a larger initial window + in various environments. [Nic97] and [KAGT98] show a reduction in + WWW page transfer time over hybrid fiber coax (HFC) and satellite + links respectively. Furthermore, it has been shown that using an + initial cwnd of 4 segments does not negatively impact overall + performance over dialup modem links with a small number of buffers + [SP98]. [AHO98] shows an improvement in transfer time for 16 KB + files across the Internet and dialup modem links when using a larger + initial value for cwnd. However, a slight increase in dropped + + + + +Allman, et al. 
Informational [Page 6] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + + segments was also shown. Finally, [PN98] shows improved transfer + time for WWW traffic in simulations with competing traffic, in + addition to a small increase in the drop rate. + +3.2.1.3 Implementation Issues + + The use of a larger initial cwnd value requires changes to the + sender's TCP stack. Using an initial congestion window of 2 segments + is allowed by RFC 2581 [APS99]. Using an initial congestion window + of 3 or 4 segments is not expected to present any danger of + congestion collapse [AFP98], however may degrade performance in some + networks. + +3.2.1.4 Topology Considerations + + It is expected that the use of a large initial window would be + equally beneficial to all network architectures outlined in section + 2. + +3.2.1.5 Possible Interaction and Relationships with Other Research + + Using a fixed larger initial congestion window decreases the impact + of a long RTT on transfer time (especially for short transfers) at + the cost of bursting data into a network with unknown conditions. A + mechanism that mitigates bursts may make the use of a larger initial + congestion window more appropriate (e.g., limiting the size of line- + rate bursts [FF96] or pacing the segments in a burst [VH97a]). + + Also, using delayed ACKs only after slow start (as outlined in + section 3.2.3) offers an alternative way to immediately ACK the first + segment of a transfer and open the congestion window more rapidly. + Finally, using some form of TCP state sharing among a number of + connections (as discussed in 3.8) may provide an alternative to using + a fixed larger initial window. + +3.2.2 Byte Counting + +3.2.2.1 Mitigation Description + + As discussed above, the wide-spread use of delayed ACKs increases the + time needed by a TCP sender to increase the size of the congestion + window during slow start. 
This is especially harmful to flows + traversing long-delay GEO satellite links. One mechanism that has + been suggested to mitigate the problems caused by delayed ACKs is the + use of "byte counting", rather than standard ACK counting + [All97a,All98]. Using standard ACK counting, the congestion window + is increased by 1 segment for each ACK received during slow start. + However, using byte counting the congestion window increase is based + + + +Allman, et al. Informational [Page 7] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + + on the number of previously unacknowledged bytes covered by each + incoming ACK, rather than on the number of ACKs received. This makes + the increase relative to the amount of data transmitted, rather than + being dependent on the ACK interval used by the receiver. + + Two forms of byte counting are studied in [All98]. The first is + unlimited byte counting (UBC). This mechanism simply uses the number + of previously unacknowledged bytes to increase the congestion window + each time an ACK arrives. The second form is limited byte counting + (LBC). LBC limits the amount of cwnd increase to 2 segments. This + limit throttles the size of the burst of data sent in response to a + "stretch ACK" [Pax97]. Stretch ACKs are acknowledgments that cover + more than 2 segments of previously unacknowledged data. Stretch ACKs + can occur by design [Joh95] (although this is not standard), due to + implementation bugs [All97b,PADHV99] or due to ACK loss. [All98] + shows that LBC prevents large line-rate bursts when compared to UBC, + and therefore offers fewer dropped segments and better performance. + In addition, UBC causes large bursts during slow start based loss + recovery due to the large cumulative ACKs that can arrive during loss + recovery. 
The behavior of UBC during loss recovery can cause large + decreases in performance and [All98] strongly recommends UBC not be + deployed without further study into mitigating the large bursts. + + Note: The standards track RFC 2581 [APS99] allows a TCP to use byte + counting to increase cwnd during congestion avoidance, however not + during slow start. + +3.2.2.2 Research + + Using byte counting, as opposed to standard ACK counting, has been + shown to reduce the amount of time needed to increase the value of + cwnd to an appropriate size in satellite networks [All97a]. In + addition, [All98] presents a simulation comparison of byte counting + and the standard cwnd increase algorithm in uncongested networks and + networks with competing traffic. This study found that the limited + form of byte counting outlined above can improve performance, while + also increasing the drop rate slightly. + + [BPK97,BPK98] also investigated unlimited byte counting in + conjunction with various ACK filtering algorithms (discussed in + section 3.10) in asymmetric networks. + + + + + + + + + + +Allman, et al. Informational [Page 8] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + +3.2.2.3 Implementation Issues + + Changing from ACK counting to byte counting requires changes to the + data sender's TCP stack. Byte counting violates the algorithm for + increasing the congestion window outlined in RFC 2581 [APS99] (by + making congestion window growth more aggressive during slow start) + and therefore should not be used in shared networks. + +3.2.2.4 Topology Considerations + + It has been suggested by some (and roundly criticized by others) that + byte counting will allow TCP to provide uniform cwnd increase, + regardless of the ACKing behavior of the receiver. In addition, byte + counting also mitigates the retarded window growth provided by + receivers that generate stretch ACKs because of the capacity of the + return link, as discussed in [BPK97,BPK98]. 
Therefore, this change + is expected to be especially beneficial to asymmetric networks. + +3.2.2.5 Possible Interaction and Relationships with Other Research + + Unlimited byte counting should not be used without a method to + mitigate the potentially large line-rate bursts the algorithm can + cause. Also, LBC may send bursts that are too large for the given + network conditions. In this case, LBC may also benefit from some + algorithm that would lessen the impact of line-rate bursts of + segments. Also note that using delayed ACKs only after slow start + (as outlined in section 3.2.3) negates the limited byte counting + algorithm because each ACK covers only one segment during slow start. + Therefore, both ACK counting and byte counting yield the same + increase in the congestion window at this point (in the first RTT). + +3.2.3 Delayed ACKs After Slow Start + +3.2.3.1 Mitigation Description + + As discussed above, TCP senders use the number of incoming ACKs to + increase the congestion window during slow start. And, since delayed + ACKs reduce the number of ACKs returned by the receiver by roughly + half, the rate of growth of the congestion window is reduced. One + proposed solution to this problem is to use delayed ACKs only after + the slow start (DAASS) phase. This provides more ACKs while TCP is + aggressively increasing the congestion window and fewer ACKs while TCP + is in steady state, which conserves network resources. + + + + + + + + +Allman, et al. Informational [Page 9] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + +3.2.3.2 Research + + [All98] shows that in simulation, using delayed ACKs after slow start + (DAASS) improves transfer time when compared to a receiver that + always generates delayed ACKs. However, DAASS also slightly + increases the loss rate due to the increased rate of cwnd growth. + +3.2.3.3 Implementation Issues + + The major problem with DAASS is in the implementation.
The receiver + has to somehow know when the sender is using the slow start + algorithm. The receiver could implement a heuristic that attempts to + watch the change in the amount of data being received and change the + ACKing behavior accordingly. Or, the sender could send a message (a + flipped bit in the TCP header, perhaps) indicating that it was using + slow start. The implementation of DAASS is, therefore, an open + issue. + + Using DAASS does not violate the TCP congestion control specification + [APS99]. However, the standards (RFC 2581 [APS99]) currently + recommend using delayed acknowledgments and DAASS goes (partially) + against this recommendation. + +3.2.3.4 Topology Considerations + + DAASS should work equally well in all scenarios presented in section + 2. However, in asymmetric networks it may aggravate ACK congestion + in the return link, due to the increased number of ACKs (see sections + 3.9 and 3.10 for a more detailed discussion of ACK congestion). + +3.2.3.5 Possible Interaction and Relationships with Other Research + + DAASS has several possible interactions with other proposals made in + the research community. DAASS can aggravate congestion on the path + between the data receiver and the data sender due to the increased + number of returning acknowledgments. This can have an especially + adverse effect on asymmetric networks that are prone to experiencing + ACK congestion. As outlined in sections 3.9 and 3.10, several + mitigations have been proposed to reduce the number of ACKs that are + passed over a low-bandwidth return link. Using DAASS will increase + the number of ACKs sent by the receiver. The interaction between + DAASS and the methods for reducing the number of ACKs is an open + research question. Also, as noted in section 3.2.1.5 above, DAASS + provides some of the same benefits as using a larger initial + congestion window and therefore it may not be desirable to use both + mechanisms together. 
However, this remains an open question. + Finally, DAASS and limited byte counting are both used to increase + + + + +Allman, et al. Informational [Page 10] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + + the rate at which the congestion window is opened. The DAASS + algorithm substantially reduces the impact limited byte counting has + on the rate of congestion window increase. + +3.2.4 Terminating Slow Start + +3.2.4.1 Mitigation Description + + The initial slow start phase is used by TCP to determine an + appropriate congestion window size for the given network conditions + [Jac88]. Slow start is terminated when TCP detects congestion, or + when the size of cwnd reaches the size of the receiver's advertised + window. Slow start is also terminated if cwnd grows beyond a certain + size. The threshold at which TCP ends slow start and begins using + the congestion avoidance algorithm is called "ssthresh" [Jac88]. In + most implementations, the initial value for ssthresh is the + receiver's advertised window. During slow start, TCP roughly doubles + the size of cwnd every RTT and therefore can overwhelm the network + with at most twice as many segments as the network can handle. By + setting ssthresh to a value less than the receiver's advertised + window initially, the sender may avoid overwhelming the network with + twice the appropriate number of segments. Hoe [Hoe96] proposes using + the packet-pair algorithm [Kes91] and the measured RTT to determine a + more appropriate value for ssthresh. The algorithm observes the + spacing between the first few returning ACKs to determine the + bandwidth of the bottleneck link. Together with the measured RTT, + the delay*bandwidth product is determined and ssthresh is set to this + value. 
When TCP's cwnd reaches this reduced ssthresh, slow start is + terminated and transmission continues using congestion avoidance, + which is a more conservative algorithm for increasing the size of the + congestion window. + +3.2.4.2 Research + + It has been shown that estimating ssthresh can improve performance + and decrease packet loss in simulations [Hoe96]. However, obtaining + an accurate estimate of the available bandwidth in a dynamic network + is very challenging, especially attempting to do so on the sending + side of the TCP connection [AP99]. Therefore, before this mechanism + is widely deployed, bandwidth estimation must be studied in more + detail. + +3.2.4.3 Implementation Issues + + As outlined in [Hoe96], estimating ssthresh requires changes to the + data sender's TCP stack. As suggested in [AP99], bandwidth estimates + may be more accurate when taken by the TCP receiver, and therefore + both sender and receiver changes would be required. Estimating + + + +Allman, et al. Informational [Page 11] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + + ssthresh is safe to implement in production networks from a + congestion control perspective, as it can only make TCP more + conservative than outlined in RFC 2581 [APS99] (assuming the TCP + implementation is using an initial ssthresh of infinity as allowed by + [APS99]). + +3.2.4.4 Topology Considerations + + It is expected that this mechanism will work equally well in all + symmetric topologies outlined in section 2. However, asymmetric + links pose a special problem, as the rate of the returning ACKs may + not be the bottleneck bandwidth in the forward direction. This can + lead to the sender setting ssthresh too low. Premature termination + of slow start can hurt performance, as congestion avoidance opens + cwnd more conservatively. Receiver-based bandwidth estimators do not + suffer from this problem.
+ +3.2.4.5 Possible Interaction and Relationships with Other Research + + Terminating slow start at the right time is useful to avoid multiple + dropped segments. However, using a selective acknowledgment-based + loss recovery scheme (as outlined in section 3.3.2) can drastically + improve TCP's ability to quickly recover from multiple lost segments. + Therefore, it may not be as important to terminate slow start before + a large loss event occurs. [AP99] shows that using delayed + acknowledgments [Bra89] reduces the effectiveness of sender-side + bandwidth estimation. Therefore, using delayed ACKs only during slow + start (as outlined in section 3.2.3) may make bandwidth estimation + more feasible. + +3.3 Loss Recovery + +3.3.1 Non-SACK Based Mechanisms + +3.3.1.1 Mitigation Description + + Several similar algorithms have been developed and studied that + improve TCP's ability to recover from multiple lost segments in a + window of data without relying on the (often long) retransmission + timeout. These sender-side algorithms, known as NewReno TCP, do not + depend on the availability of selective acknowledgments (SACKs) + [MMFR96]. + + These algorithms generally work by updating the fast recovery + algorithm to use information provided by "partial ACKs" to trigger + retransmissions. A partial ACK covers some new data, but not all + data outstanding when a particular loss event starts. For instance, + consider the case when segment N is retransmitted using the fast + + + +Allman, et al. Informational [Page 12] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + + retransmit algorithm and segment M is the last segment sent when + segment N is resent. If segment N is the only segment lost, the ACK + elicited by the retransmission of segment N would be for segment M. + If, however, segment N+1 was also lost, the ACK elicited by the + retransmission of segment N will be N+1.
This can be taken as an + indication that segment N+1 was lost and used to trigger a + retransmission. + +3.3.1.2 Research + + Hoe [Hoe95,Hoe96] introduced the idea of using partial ACKs to + trigger retransmissions and showed that doing so could improve + performance. [FF96] shows that in some cases using partial ACKs to + trigger retransmissions reduces the time required to recover from + multiple lost segments. However, [FF96] also shows that in some + cases (many lost segments) relying on the RTO timer can improve + performance over simply using partial ACKs to trigger all + retransmissions. [HK99] shows that using partial ACKs to trigger + retransmissions, in conjunction with SACK, improves performance when + compared to TCP using fast retransmit/fast recovery in a satellite + environment. Finally, [FH99] describes several slightly different + variants of NewReno. + +3.3.1.3 Implementation Issues + + Implementing these fast recovery enhancements requires changes to the + sender-side TCP stack. These changes can safely be implemented in + production networks and are allowed by RFC 2581 [APS99]. + +3.3.1.4 Topology Considerations + + It is expected that these changes will work well in all environments + outlined in section 2. + +3.3.1.5 Possible Interaction and Relationships with Other Research + + See section 3.3.2.2.5. + +3.3.2 SACK Based Mechanisms + +3.3.2.1 Fast Recovery with SACK + +3.3.2.1.1 Mitigation Description + + Fall and Floyd [FF96] describe a conservative extension to the fast + recovery algorithm that takes into account information provided by + selective acknowledgments (SACKs) [MMFR96] sent by the receiver. The + algorithm starts after fast retransmit triggers the resending of a + + + +Allman, et al. Informational [Page 13] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + + segment. As with fast retransmit, the algorithm cuts cwnd in half + when a loss is detected. 
The algorithm keeps a variable called + "pipe", which is an estimate of the number of outstanding segments in + the network. The pipe variable is decremented by 1 segment for each + duplicate ACK that arrives with new SACK information. The pipe + variable is incremented by 1 for each new or retransmitted segment + sent. A segment may be sent when the value of pipe is less than cwnd + (this segment is either a retransmission per the SACK information or + a new segment if the SACK information indicates that no more + retransmits are needed). + + This algorithm generally allows TCP to recover from multiple segment + losses in a window of data within one RTT of loss detection. Like + the forward acknowledgment (FACK) algorithm described below, the SACK + information allows the pipe algorithm to decouple the choice of when + to send a segment from the choice of what segment to send. + + [APS99] allows the use of this algorithm, as it is consistent with + the spirit of the fast recovery algorithm. + +3.3.2.1.2 Research + + [FF96] shows that the above described SACK algorithm performs better + than several non-SACK based recovery algorithms when 1--4 segments + are lost from a window of data. [AHKO97] shows that the algorithm + improves performance over satellite links. Hayes [Hay97] shows that + in certain circumstances, the SACK algorithm can hurt performance by + generating a large line-rate burst of data at the end of loss + recovery, which causes further loss. + +3.3.2.1.3 Implementation Issues + + This algorithm is implemented in the sender's TCP stack. However, it + relies on SACK information generated by the receiver. This algorithm + is safe for shared networks and is allowed by RFC 2581 [APS99]. + +3.3.2.1.4 Topology Considerations + + It is expected that the pipe algorithm will work equally well in all + scenarios presented in section 2. + +3.3.2.1.5 Possible Interaction and Relationships with Other Research + + See section 3.3.2.2.5.
+ + + + + + + +Allman, et al. Informational [Page 14] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + +3.3.2.2 Forward Acknowledgments + +3.3.2.2.1 Mitigation Description + + The Forward Acknowledgment (FACK) algorithm [MM96a,MM96b] was + developed to improve TCP congestion control during loss recovery. + FACK uses TCP SACK options to glean additional information about the + congestion state, adding more precise control to the injection of + data into the network during recovery. FACK decouples the congestion + control algorithms from the data recovery algorithms to provide a + simple and direct way to use SACK to improve congestion control. Due + to the separation of these two algorithms, new data may be sent + during recovery to sustain TCP's self-clock when there is no further + data to retransmit. + + The most recent version of FACK is Rate-Halving [MM96b], in which one + packet is sent for every two ACKs received during recovery. + Transmitting a segment for every-other ACK has the result of reducing + the congestion window in one round trip to half of the number of + packets that were successfully handled by the network (so when cwnd + is too large by more than a factor of two it still gets reduced to + half of what the network can sustain). Another important aspect of + FACK with Rate-Halving is that it sustains the ACK self-clock during + recovery because transmitting a packet for every-other ACK does not + require half a cwnd of data to drain from the network before + transmitting, as required by the fast recovery algorithm + [Ste97,APS99]. + + In addition, the FACK with Rate-Halving implementation provides + Thresholded Retransmission to each lost segment. "Tcprexmtthresh" is + the number of duplicate ACKs required by TCP to trigger a fast + retransmit and enter recovery. 
FACK applies thresholded + retransmission to all segments by waiting until tcprexmtthresh SACK + blocks indicate that a given segment is missing before resending the + segment. This allows reasonable behavior on links that reorder + segments. As described above, FACK sends a segment for every second + ACK received during recovery. New segments are transmitted except + when tcprexmtthresh SACK blocks have been observed for a dropped + segment, at which point the dropped segment is retransmitted. + + [APS99] allows the use of this algorithm, as it is consistent with + the spirit of the fast recovery algorithm. + +3.3.2.2.2 Research + + The original FACK algorithm is outlined in [MM96a]. The algorithm + was later enhanced to include Rate-Halving [MM96b]. The real-world + performance of FACK with Rate-Halving was shown to be much closer to + + + +Allman, et al. Informational [Page 15] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + + the theoretical maximum for TCP than either TCP Reno or the SACK- + based extensions to fast recovery outlined in section 3.3.2.1 + [MSMO97]. + +3.3.2.2.3 Implementation Issues + + In order to use FACK, the sender's TCP stack must be modified. In + addition, the receiver must be able to generate SACK options to + obtain the full benefit of using FACK. The FACK algorithm is safe + for shared networks and is allowed by RFC 2581 [APS99]. + +3.3.2.2.4 Topology Considerations + + FACK is expected to improve performance in all environments outlined + in section 2. Since it is better able to sustain its self-clock than + TCP Reno, it may be considerably more attractive over long delay + paths. + +3.3.2.2.5 Possible Interaction and Relationships with Other Research + + Both SACK based loss recovery algorithms described above (the fast + recovery enhancement and the FACK algorithm) are similar in that they + attempt to effectively repair multiple lost segments from a window of + data. 
Which of the SACK-based loss recovery algorithms to use is + still an open research question. In addition, these algorithms are + similar to the non-SACK NewReno algorithm described in section 3.3.1, + in that they attempt to recover from multiple lost segments without + reverting to using the retransmission timer. As has been shown, the + above SACK based algorithms are more robust than the NewReno + algorithm. However, the SACK algorithm requires a cooperating TCP + receiver, which the NewReno algorithm does not. A reasonable TCP + implementation might include both a SACK-based and a NewReno-based + loss recovery algorithm such that the sender can use the most + appropriate loss recovery algorithm based on whether or not the + receiver supports SACKs. Finally, both SACK-based and non-SACK-based + versions of fast recovery have been shown to transmit a large burst + of data upon leaving loss recovery, in some cases [Hay97]. + Therefore, the algorithms may benefit from some burst suppression + algorithm. + +3.3.3 Explicit Congestion Notification + +3.3.3.1 Mitigation Description + + Explicit congestion notification (ECN) allows routers to inform TCP + senders about imminent congestion without dropping segments. Two + major forms of ECN have been studied. A router employing backward + ECN (BECN), transmits messages directly to the data originator + + + +Allman, et al. Informational [Page 16] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + + informing it of congestion. IP routers can accomplish this with an + ICMP Source Quench message. The arrival of a BECN signal may or may + not mean that a TCP data segment has been dropped, but it is a clear + indication that the TCP sender should reduce its sending rate (i.e., + the value of cwnd). The second major form of congestion notification + is forward ECN (FECN). FECN routers mark data segments with a + special tag when congestion is imminent, but forward the data + segment. 
The data receiver then echoes the congestion information + back to the sender in the ACK packet. A description of a FECN + mechanism for TCP/IP is given in [RF99]. + + As described in [RF99], senders transmit segments with an "ECN- + Capable Transport" bit set in the IP header of each packet. If a + router employing an active queueing strategy, such as Random Early + Detection (RED) [FJ93,BCC+98], would otherwise drop this segment, a + "Congestion Experienced" bit in the IP header is set instead. Upon + reception, the information is echoed back to TCP senders using a bit + in the TCP header. The TCP sender adjusts the congestion window just + as it would if a segment was dropped. + + The implementation of ECN as specified in [RF99] requires the + deployment of active queue management mechanisms in the affected + routers. This allows the routers to signal congestion by sending TCP + a small number of "congestion signals" (segment drops or ECN + messages), rather than discarding a large number of segments, as can + happen when TCP overwhelms a drop-tail router queue. + + Since satellite networks generally have higher bit-error rates than + terrestrial networks, determining whether a segment was lost due to + congestion or corruption may allow TCP to achieve better performance + in high BER environments than currently possible (due to TCP's + assumption that all loss is due to congestion). While not a solution + to this problem, adding an ECN mechanism to TCP may be a part of a + mechanism that will help achieve this goal. See section 3.3.4 for a + more detailed discussion of differentiating between corruption and + congestion based losses. + +3.3.3.2 Research + + [Flo94] shows that ECN is effective in reducing the segment loss rate + which yields better performance especially for short and interactive + TCP connections. Furthermore, [Flo94] also shows that ECN avoids + some unnecessary, and costly TCP retransmission timeouts.
Finally, + [Flo94] also considers some of the advantages and disadvantages of + various forms of explicit congestion notification. + + + + + + +Allman, et al. Informational [Page 17] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + +3.3.3.3 Implementation Issues + + Deployment of ECN requires changes to the TCP implementation on both + sender and receiver. Additionally, deployment of ECN requires + deployment of some active queue management infrastructure in routers. + RED is assumed in most ECN discussions, because RED is already + identifying segments to drop, even before its buffer space is + exhausted. ECN simply allows the delivery of "marked" segments while + still notifying the end nodes that congestion is occurring along the + path. ECN is safe (from a congestion control perspective) for shared + networks, as it maintains the same TCP congestion control principles + as are used when congestion is detected via segment drops. + +3.3.3.4 Topology Considerations + + It is expected that none of the environments outlined in section 2 + will present a bias towards or against ECN traffic. + +3.3.3.5 Possible Interaction and Relationships with Other Research + + Note that some form of active queueing is necessary to use ECN (e.g., + RED queueing). + +3.3.4 Detecting Corruption Loss + + Differentiating between congestion (loss of segments due to router + buffer overflow or imminent buffer overflow) and corruption (loss of + segments due to damaged bits) is a difficult problem for TCP. This + differentiation is particularly important because the action that TCP + should take in the two cases is entirely different. In the case of + corruption, TCP should merely retransmit the damaged segment as soon + as its loss is detected; there is no need for TCP to adjust its + congestion window. 
On the other hand, as has been widely discussed + above, when the TCP sender detects congestion, it should immediately + reduce its congestion window to avoid making the congestion worse. + + TCP's defined behavior, as motivated by [Jac88,Jac90] and defined in + [Bra89,Ste97,APS99], is to assume that all loss is due to congestion + and to trigger the congestion control algorithms, as defined in + [Ste97,APS99]. The loss may be detected using the fast retransmit + algorithm, or in the worst case is detected by the expiration of + TCP's retransmission timer. + + TCP's assumption that loss is due to congestion rather than + corruption is a conservative mechanism that prevents congestion + collapse [Jac88,FF98]. Over satellite networks, however, as in many + wireless environments, loss due to corruption is more common than on + terrestrial networks. One common partial solution to this problem is + + + +Allman, et al. Informational [Page 18] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + + to add Forward Error Correction (FEC) to the data that's sent over + the satellite/wireless link. A more complete discussion of the + benefits of FEC can be found in [AGS99]. However, given that FEC + does not always work or cannot be universally applied, other + mechanisms have been studied to attempt to make TCP able to + differentiate between congestion-based and corruption-based loss. + + TCP segments that have been corrupted are most often dropped by + intervening routers when link-level checksum mechanisms detect that + an incoming frame has errors. Occasionally, a TCP segment containing + an error may survive without detection until it arrives at the TCP + receiving host, at which point it will almost always either fail the + IP header checksum or the TCP checksum and be discarded as in the + link-level error case. 
Unfortunately, in either of these cases, it's + not generally safe for the node detecting the corruption to return + information about the corrupt packet to the TCP sender because the + sending address itself might have been corrupted. + +3.3.4.1 Mitigation Description + + Because the probability of link errors on a satellite link is + relatively greater than on a hardwired link, it is particularly + important that the TCP sender retransmit these lost segments without + reducing its congestion window. Because corrupt segments do not + indicate congestion, there is no need for the TCP sender to enter a + congestion avoidance phase, which may waste available bandwidth. + Simulations performed in [SF98] show a performance improvement when + TCP can properly differentiate between corruption and + congestion of wireless links. + + Perhaps the greatest research challenge in detecting corruption is + getting TCP (a transport-layer protocol) to receive appropriate + information from either the network layer (IP) or the link layer. + Much of the work done to date has involved link-layer mechanisms that + retransmit damaged segments. The challenge seems to be to get these + mechanisms to make repairs in such a way that TCP understands what + happened and can respond appropriately. + +3.3.4.2 Research + + Research into corruption detection to date has focused primarily on + making the link level detect errors and then perform link-level + retransmissions. This work is summarized in [BKVP97,BPSK96]. One of + the problems with this promising technique is that it causes an + effective reordering of the segments from the TCP receiver's point of + view. As a simple example, if segments A B C D are sent across a + noisy link and segment B is corrupted, segments C and D may have + already crossed the link before B can be retransmitted at the link + + + +Allman, et al.
Informational [Page 19] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + + level, causing them to arrive at the TCP receiver in the order A C D + B. This segment reordering would cause the TCP receiver to generate + duplicate ACKs upon the arrival of segments C and D. If the + reordering was bad enough, the sender would trigger the fast + retransmit algorithm in the TCP sender, in response to the duplicate + ACKs. Research presented in [MV98] proposes the idea of suppressing + or delaying the duplicate ACKs in the reverse direction to counteract + this behavior. Alternatively, proposals that make TCP more robust in + the face of re-ordered segment arrivals [Flo99] may reduce the side + effects of the re-ordering caused by link-layer retransmissions. + + A more high-level approach, outlined in [DMT96], uses a new + "corruption experienced" ICMP error message generated by routers that + detect corruption. These messages are sent in the forward direction, + toward the packet's destination, rather than in the reverse direction + as is done with ICMP Source Quench messages. Sending the error + messages in the forward direction allows this feedback to work over + asymmetric paths. As noted above, generating an error message in + response to a damaged packet is problematic because the source and + destination addresses may not be valid. The mechanism outlined in + [DMT96] gets around this problem by having the routers maintain a + small cache of recent packet destinations; when the router + experiences an error rate above some threshold, it sends an ICMP + corruption-experienced message to all of the destinations in its + cache. Each TCP receiver then must return this information to its + respective TCP sender (through a TCP option).
Upon receiving an ACK + with this "corruption-experienced" option, the TCP sender assumes + that packet loss is due to corruption rather than congestion for two + round trip times (RTT) or until it receives additional link state + information (such as "link down", source quench, or additional + "corruption experienced" messages). Note that in shared networks, + ignoring segment loss for 2 RTTs may aggravate congestion by making + TCP unresponsive. + +3.3.4.3 Implementation Issues + + All of the techniques discussed above require changes to at least the + TCP sending and receiving stacks, as well as intermediate routers. + Due to the concerns over possibly ignoring congestion signals (i.e., + segment drops), the above algorithm is not recommended for use in + shared networks. + +3.3.4.4 Topology Considerations + + It is expected that corruption detection, in general, would be + beneficial in all environments outlined in section 2. It would be + particularly beneficial in the satellite/wireless environment over + which these errors may be more prevalent. + + + +Allman, et al. Informational [Page 20] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + +3.3.4.5 Possible Interaction and Relationships with Other Research + + SACK-based loss recovery algorithms (as described in 3.3.2) may + reduce the impact of corrupted segments on mostly clean links because + recovery will be able to happen more rapidly (and without relying on + the retransmission timer). Note that while SACK-based loss recovery + helps, throughput will still suffer in the face of non-congestion + related packet loss. + +3.4 Congestion Avoidance + +3.4.1 Mitigation Description + + During congestion avoidance, in the absence of loss, the TCP sender + adds approximately one segment to its congestion window during each + RTT [Jac88,Ste97,APS99].
Several researchers have observed that this + policy leads to unfair sharing of bandwidth when multiple connections + with different RTTs traverse the same bottleneck link, with the long + RTT connections obtaining only a small fraction of their fair share + of the bandwidth. + + One effective solution to this problem is to deploy fair queueing and + TCP-friendly buffer management in network routers [Sut98]. However, + in the absence of help from the network, other researchers have + investigated changes to the congestion avoidance policy at the TCP + sender, as described in [Flo91,HK98]. + +3.4.2 Research + + The "Constant-Rate" increase policy has been studied in [Flo91,HK98]. + It attempts to equalize the rate at which TCP senders increase their + sending rate during congestion avoidance. Both [Flo91] and [HK98] + illustrate cases in which the "Constant-Rate" policy largely corrects + the bias against long RTT connections, although [HK98] presents some + evidence that such a policy may be difficult to incrementally deploy + in an operational network. The proper selection of a constant (for + the constant rate of increase) is an open issue. + + The "Increase-by-K" policy can be selectively used by long RTT + connections in a heterogeneous environment. This policy simply + changes the slope of the linear increase, with connections over a + given RTT threshold adding "K" segments to the congestion window + every RTT, instead of one. [HK98] presents evidence that this + policy, when used with small values of "K", may be successful in + reducing the unfairness while keeping the link utilization high, when + a small number of connections share a bottleneck link. The selection + of the constant "K," the RTT threshold to invoke this policy, and + performance under a large number of flows are all open issues. + + + +Allman, et al. 
Informational [Page 21] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + +3.4.3 Implementation Issues + + Implementation of either the "Constant-Rate" or "Increase-by-K" + policies requires a change to the congestion avoidance mechanism at + the TCP sender. In the case of "Constant-Rate," such a change must + be implemented globally. Additionally, the TCP sender must have a + reasonably accurate estimate of the RTT of the connection. The + algorithms outlined above violate the congestion avoidance algorithm + as outlined in RFC 2581 [APS99] and therefore should not be + implemented in shared networks at this time. + +3.4.4 Topology Considerations + + These solutions are applicable to all satellite networks that are + integrated with a terrestrial network, in which satellite connections + may be competing with terrestrial connections for the same bottleneck + link. + +3.4.5 Possible Interaction and Relationships with Other Research + + As shown in [PADHV99], increasing the congestion window by multiple + segments per RTT can cause TCP to drop multiple segments and force a + retransmission timeout in some versions of TCP. Therefore, the above + changes to the congestion avoidance algorithm may need to be + accompanied by a SACK-based loss recovery algorithm that can quickly + repair multiple dropped segments. + +3.5 Multiple Data Connections + +3.5.1 Mitigation Description + + One method that has been used to overcome TCP's inefficiencies in the + satellite environment is to use multiple TCP flows to transfer a + given file. The use of N TCP connections makes the sender N times + more aggressive and therefore can improve throughput in some + situations. Using N multiple TCP connections can impact the transfer + and the network in a number of ways, which are listed below. + + 1. The transfer is able to start transmission using an effective + congestion window of N segments, rather than a single segment as + one TCP flow uses. 
This allows the transfer to more quickly + increase the effective cwnd size to an appropriate size for the + given network. However, in some circumstances an initial window + of N segments is inappropriate for the network conditions. In + this case, a transfer utilizing more than one connection may + aggravate congestion. + + + + + +Allman, et al. Informational [Page 22] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + + 2. During the congestion avoidance phase, the transfer increases the + effective cwnd by N segments per RTT, rather than the one segment + per RTT increase that a single TCP connection provides. Again, + this can aid the transfer by more rapidly increasing the effective + cwnd to an appropriate point. However, this rate of increase can + also be too aggressive for the network conditions. In this case, + the use of multiple data connections can aggravate congestion in + the network. + + 3. Using multiple connections can provide a very large overall + congestion window. This can be an advantage for TCP + implementations that do not support the TCP window scaling + extension [JBB92]. However, the aggregate cwnd size across all N + connections is equivalent to using a TCP implementation that + supports large windows. + + 4. The overall cwnd decrease in the face of dropped segments is + reduced when using N parallel connections. A single TCP + connection reduces the effective size of cwnd to half when a + single segment loss is detected. When utilizing N connections + each using a window of W bytes, a single drop reduces the window + to: + + (N * W) - (W / 2) + + Clearly this is a less dramatic reduction in the effective cwnd size + than when using a single TCP connection. And, the amount by which + the cwnd is decreased is further reduced by increasing N. 
+ + The use of multiple data connections can increase the ability of + non-SACK TCP implementations to quickly recover from multiple dropped + segments without resorting to a timeout, assuming the dropped + segments cross connections. + + The use of multiple parallel connections makes TCP overly aggressive + for many environments and can contribute to congestive collapse in + shared networks [FF99]. The advantages provided by using multiple + TCP connections are now largely provided by TCP extensions (larger + windows, SACKs, etc.). Therefore, the use of a single TCP connection + is more "network friendly" than using multiple parallel connections. + However, using multiple parallel TCP connections may provide + performance improvement in private networks. + + + + + + + + + +Allman, et al. Informational [Page 23] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + +3.5.2 Research + + Research on the use of multiple parallel TCP connections shows + improved performance [IL92,Hah94,AOK95,AKO96]. In addition, research + has shown that multiple TCP connections can outperform a single + modern TCP connection (with large windows and SACK) [AHKO97]. + However, these studies did not consider the impact of using multiple + TCP connections on competing traffic. [FF99] argues that using + multiple simultaneous connections to transfer a given file may lead + to congestive collapse in shared networks. + +3.5.3 Implementation Issues + + To utilize multiple parallel TCP connections a client application and + the corresponding server must be customized. As outlined in [FF99] + using multiple parallel TCP connections is not safe (from a + congestion control perspective) in shared networks and should not be + used. + +3.5.4 Topological Considerations + + As stated above, [FF99] outlines that the use of multiple parallel + connections in a shared network, such as the Internet, may lead to + congestive collapse. 
However, the use of multiple connections may be + safe and beneficial in private networks. The specific topology being + used will dictate the number of parallel connections required. Some + work has been done to determine the appropriate number of connections + on the fly [AKO96], but such a mechanism is far from complete. + +3.5.5 Possible Interaction and Relationships with Other Research + + Using multiple concurrent TCP connections enables use of a large + congestion window, much like the TCP window scaling option [JBB92]. + In addition, a larger initial congestion window is achieved, similar + to using [AFP98] or TCB sharing (see section 3.8). + +3.6 Pacing TCP Segments + +3.6.1 Mitigation Description + + Slow-start takes several round trips to fully open the TCP congestion + window over routes with high bandwidth-delay products. For short TCP + connections (such as WWW traffic with HTTP/1.0), the slow-start + overhead can preclude effective use of the high-bandwidth satellite + links. When senders implement slow-start restart after a TCP + connection goes idle (suggested by Jacobson and Karels [JK92]), + + + + + +Allman, et al. Informational [Page 24] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + + performance is reduced in long-lived (but bursty) connections (such + as HTTP/1.1, which uses persistent TCP connections to transfer + multiple WWW page elements) [Hei97a]. + + Rate-based pacing (RBP) is a technique, used in the absence of + incoming ACKs, where the data sender temporarily paces TCP segments + at a given rate to restart the ACK clock. Upon receipt of the first + ACK, pacing is discontinued and normal TCP ACK clocking resumes. 
The + pacing rate may either be known from recent traffic estimates (when + restarting an idle connection or from recent prior connections), or + may be known through external means (perhaps in a point-to-point or + point-to-multipoint satellite network where available bandwidth can + be assumed to be large). + + In addition, pacing data during the first RTT of a transfer may allow + TCP to make effective use of high bandwidth-delay links even for + short transfers. However, in order to pace segments during the first + RTT a TCP will have to be using a non-standard initial congestion + window and a new mechanism to pace outgoing segments rather than send + them back-to-back. Determining an appropriate size for the initial + cwnd is an open research question. Pacing can also be used to reduce + bursts in general (due to buggy TCPs or byte counting, see section + 3.2.2 for a discussion on byte counting). + +3.6.2 Research + + Simulation studies of rate-based pacing for WWW-like traffic have + shown reductions in router congestion and drop rates [VH97a]. In + this environment, RBP substantially improves performance compared to + slow-start-after-idle for intermittent senders, and it slightly + improves performance over burst-full-cwnd-after-idle (because of + drops) [VH98]. More recently, pacing has been suggested to eliminate + burstiness in networks with ACK filtering [BPK97]. + +3.6.3 Implementation Issues + + RBP requires only sender-side changes to TCP. Prototype + implementations of RBP are available [VH97b]. RBP requires an + additional sender timer for pacing. The overhead of timer-driven + data transfer is often considered too high for practical use. + Preliminary experiments suggest that in RBP this overhead is minimal + because RBP only requires this timer for one RTT of transmission + [VH98]. RBP is expected to make TCP more conservative in sending + bursts of data after an idle period in hosts that do not revert to + slow start after an idle period.
On the other hand, RBP makes TCP + more aggressive if the sender uses the slow start algorithm to start + the ACK clock after a long idle period. + + + + +Allman, et al. Informational [Page 25] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + +3.6.4 Topology Considerations + + RBP could be used to restart idle TCP connections for all topologies + in Section 2. Use at the beginning of new connections would be + restricted to topologies where available bandwidth can be estimated + out-of-band. + +3.6.5 Possible Interaction and Relationships with Other Research + + Pacing segments may benefit from sharing state amongst various flows + between two hosts, due to the time required to determine the needed + information. Additionally, pacing segments, rather than sending + back-to-back segments, may make estimating the available bandwidth + (as outlined in section 3.2.4) more difficult. + +3.7 TCP Header Compression + + The TCP and IP header information needed to reliably deliver packets + to a remote site across the Internet can add significant overhead, + especially for interactive applications. Telnet packets, for + example, typically carry only a few bytes of data per packet, and + standard IPv4/TCP headers add at least 40 bytes to this; IPv6/TCP + headers add at least 60 bytes. Much of this information remains + relatively constant over the course of a session and so can be + replaced by a short session identifier. + +3.7.1 Mitigation Description + + Many fields in the TCP and IP headers either remain constant during + the course of a session, change very infrequently, or can be inferred + from other sources. For example, the source and destination + addresses, as well as the IP version, protocol, and port fields + generally do not change during a session. Packet length can be + deduced from the length field of the underlying link layer protocol + provided that the link layer packet is not padded. 
Packet sequence + numbers in a forward data stream generally change with every packet, + but increase in a predictable manner. + + The TCP/IP header compression methods described in + [DNP99,DENP97,Jac90] reduce the overhead of TCP sessions by replacing + the data in the TCP and IP headers that remains constant, changes + slowly, or changes in a predictable manner with a short "connection + number". Using this method, the sender first sends a full TCP/IP + header, including in it a connection number that the sender will use + to reference the connection. The receiver stores the full header and + uses it as a template, filling in some fields from the limited + + + + + +Allman, et al. Informational [Page 26] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + + information contained in later, compressed headers. This compression + can reduce the size of an IPv4/TCP header from 40 to as few as 3 to + 5 bytes (3 bytes for some common cases, 5 bytes in general). + + Compression and decompression generally happen below the IP layer, at + the end-points of a given physical link (such as at two routers + connected by a serial line). The hosts on either side of the + physical link must maintain some state about the TCP connections that + are using the link. + + The decompresser must pass complete, uncompressed packets to the IP + layer. Thus header compression is transparent to routing, for + example, since an incoming packet with compressed headers is expanded + before being passed to the IP layer. + + A variety of methods can be used by the compressor/decompressor to + negotiate the use of header compression. For example, the PPP serial + line protocol allows for an option exchange, during which time the + compressor/decompressor agree on whether or not to use header + compression. For older SLIP implementations, [Jac90] describes a + mechanism that uses the first bit in the IP packet as a flag.
+ + The reduction in overhead is especially useful when the link is + bandwidth-limited such as terrestrial wireless and mobile satellite + links, where the overhead associated with transmitting the header + bits is nontrivial. Header compression has the added advantage that + for the case of uniformly distributed bit errors, compressing TCP/IP + headers can provide a better quality of service by decreasing the + packet error probability. The shorter, compressed packets are less + likely to be corrupted, and the reduction in errors increases the + connection's throughput. + + Extra space is saved by encoding changes in fields that change + relatively slowly by sending only their difference from their values + in the previous packet instead of their absolute values. In order to + decode headers compressed this way, the receiver keeps a copy of each + full, reconstructed TCP header after it is decoded, and applies the + delta values from the next decoded compressed header to the + reconstructed full header template. + + A disadvantage to using this delta encoding scheme where values are + encoded as deltas from their values in the previous packet is that if + a single compressed packet is lost, subsequent packets with + compressed headers can become garbled if they contain fields which + depend on the lost packet. Consider a forward data stream of packets + with compressed headers and increasing sequence numbers. If packet N + is lost, the full header of packet N+1 will be reconstructed at the + receiver using packet N-1's full header as a template. Thus the + + + +Allman, et al. Informational [Page 27] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + + sequence number, which should have been calculated from packet N's + header, will be wrong, the checksum will fail, and the packet will be + discarded. When the sending TCP times out and retransmits, a packet + with a full header is forwarded to re-synchronize the decompresser.
+ + It is important to note that the compressor does not maintain any + timers, nor does the decompresser know when an error occurred (only + the receiving TCP knows this, when the TCP checksum fails). A single + bit error will cause the decompresser to lose sync, and subsequent + packets with compressed headers will be dropped by the receiving TCP, + since they will all fail the TCP checksum. When this happens, no + duplicate acknowledgments will be generated, and the decompresser can + only re-synchronize when it receives a packet with an uncompressed + header. This means that when header compression is being used, both + fast retransmit and selective acknowledgments will not be able to + correct packets lost on a compressed link. The "twice" algorithm, + described below, may be a partial solution to this problem. + + [DNP99] and [DENP97] describe TCP/IPv4 and TCP/IPv6 compression + algorithms including compressing the various IPv6 extension headers + as well as methods for compressing non-TCP streams. [DENP97] also + augments TCP header compression by introducing the "twice" algorithm. + If a particular packet fails to decompress properly, the twice + algorithm modifies its assumptions about the inferred fields in the + compressed header, assuming that a packet identical to the current + one was dropped between the last correctly decoded packet and the + current one. Twice then tries to decompress the received packet + under the new assumptions and, if the checksum passes, the packet is + passed to IP and the decompresser state has been re-synchronized. + This procedure can be extended to three or more decoding attempts. + Additional robustness can be achieved by caching full copies of + packets which don't decompress properly in the hopes that later + arrivals will fix the problem. Finally, the performance improvement + if the decompresser can explicitly request a full header is + discussed.
Simulation results show that twice, in conjunction with + the full header request mechanism, can improve throughput over + uncompressed streams. + +3.7.2 Research + + [Jac90] outlines a simple header compression scheme for TCP/IP. + + In [DENP97] the authors present the results of simulations showing + that header compression is advantageous for both low and medium + bandwidth links. Simulations show that the twice algorithm, combined + with an explicit header request mechanism, improved throughput by + 10-15% over uncompressed sessions across a wide range of bit error + rates. + + + +Allman, et al. Informational [Page 28] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + + Much of this improvement may have been due to the twice algorithm + quickly re-synchronizing the decompresser when a packet is lost. + This is because the twice algorithm, applied one or two times when + the decompresser becomes unsynchronized, will re-sync the + decompresser in between 83% and 99% of the cases examined. This + means that packets received correctly after twice has resynchronized + the decompresser will cause duplicate acknowledgments. This re- + enables the use of both fast retransmit and SACK in conjunction with + header compression. + +3.7.3 Implementation Issues + + Implementing TCP/IP header compression requires changes at both the + sending (compressor) and receiving (decompresser) ends of each link + that uses compression. The twice algorithm requires very little + extra machinery over and above header compression, while the explicit + header request mechanism of [DENP97] requires more extensive + modifications to the sending and receiving ends of each link that + employs header compression. Header compression does not violate + TCP's congestion control mechanisms and therefore can be safely + implemented in shared networks. 
+ +3.7.4 Topology Considerations + + TCP/IP header compression is applicable to all of the environments + discussed in section 2, but will provide relatively more improvement + in situations where packet sizes are small (i.e., overhead is large) + and there is medium to low bandwidth and/or higher BER. When TCP's + congestion window size is large, implementing the explicit header + request mechanism, the twice algorithm, and caching packets which + fail to decompress properly becomes more critical. + +3.7.5 Possible Interaction and Relationships with Other Research + + As discussed above, losing synchronization between a sender and + receiver can cause many packet drops. The frequency of losing + synchronization and the effectiveness of the twice algorithm may + point to using a SACK-based loss recovery algorithm to reduce the + impact of multiple lost segments. However, even very robust SACK- + based algorithms may not work well if too many segments are lost. + +3.8 Sharing TCP State Among Similar Connections + + + + + + + + + +Allman, et al. Informational [Page 29] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + +3.8.1 Mitigation Description + + Persistent TCP state information can be used to overcome limitations + in the configuration of the initial state, and to automatically tune + TCP to environments using satellite links and to coordinate multiple + TCP connections sharing a satellite link. + + TCP includes a variety of parameters, many of which are set to + initial values which can severely affect the performance of TCP + connections traversing satellite links, even though most TCP + parameters are adjusted later after the connection is established. + These parameters include initial size of cwnd and initial MSS size. + Various suggestions have been made to change these initial + conditions, to more effectively support satellite links. 
However, it + is difficult to select any single set of parameters which is + effective for all environments. + + An alternative to attempting to select these parameters a-priori is + sharing state across TCP connections and using this state when + initializing a new connection. For example, if all connections to a + subnet result in extended congestion windows of 1 megabyte, it is + probably more efficient to start new connections with this value, + than to rediscover it by requiring the cwnd to increase using slow + start over a period of dozens of round-trip times. + +3.8.2 Research + + Sharing state among connections brings up a number of questions such + as what information to share, with whom to share, how to share it, + and how to age shared information. First, what information is to be + shared must be determined. Some information may be appropriate to + share among TCP connections, while some information sharing may be + inappropriate or not useful. Next, we need to determine with whom to + share information. Sharing may be appropriate for TCP connections + sharing a common path to a given host. Information may be shared + among connections within a host, or even among connections between + different hosts, such as hosts on the same LAN. However, sharing + information between connections not traversing the same network may + not be appropriate. Given the state to share and the parties that + share it, a mechanism for the sharing is required. Simple state, + like MSS and RTT, is easy to share, but congestion window information + can be shared a variety of ways. The sharing mechanism determines + priorities among the sharing connections, and a variety of fairness + criteria need to be considered. Also, the mechanisms by which + information is aged require further study. See RFC 2140 for a + discussion of the security issues in both sharing state within a + single host and sharing state among hosts on a subnet. 
Finally, the + security concerns associated with sharing a piece of information need + + + +Allman, et al. Informational [Page 30] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + + to be carefully considered before introducing such a mechanism. Many + of these open research questions must be answered before state + sharing can be widely deployed. + + The opportunity for such sharing, both among a sequence of + connections, as well as among concurrent connections, is described in + more detail in [Tou97]. The state management itself is largely an + implementation issue, however what information should be shared and + the specific ways in which the information should be shared is an + open question. + + Sharing parts of the TCB state was originally documented in T/TCP + [Bra92], and is used there to aggregate RTT values across connection + instances, to provide meaningful average RTTs, even though most + connections are expected to persist for only one RTT. T/TCP also + shares a connection identifier, a sequence number separate from the + window number and address/port pairs by which TCP connections are + typically distinguished. As a result of this shared state, T/TCP + allows a receiver to pass data in the SYN segment to the receiving + application, prior to the completion of the three-way handshake, + without compromising the integrity of the connection. In effect, this + shared state caches a partial handshake from the previous connection, + which is a variant of the more general issue of TCB sharing. + + Sharing state among connections (including transfers using non-TCP + protocols) is further investigated in [BRS99]. + +3.8.3 Implementation Issues + + Sharing TCP state across connections requires changes to the sender's + TCP stack, and possibly the receiver's TCP stack (as in the case of + T/TCP, for example). Sharing TCP state may make a particular TCP + connection more aggressive. 
However, the aggregate traffic should be + more conservative than a group of independent TCP connections. + Therefore, sharing TCP state should be safe for use in shared + networks. Note that state sharing does not present any new security + problems within multiuser hosts. In such a situation, users can + steal network resources from one another with or without state + sharing. + +3.8.4 Topology Considerations + + It is expected that sharing state across TCP connections may be + useful in all network environments presented in section 2. + + + + + + + +Allman, et al. Informational [Page 31] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + +3.8.5 Possible Interaction and Relationships with Other Research + + The state sharing outlined above is very similar to the Congestion + Manager proposal [BRS99] that attempts to share congestion control + information among both TCP and UDP flows between a pair of hosts. + +3.9 ACK Congestion Control + + In highly asymmetric networks, a low-speed return link can restrict + the performance of the data flow on a high-speed forward link by + limiting the flow of acknowledgments returned to the data sender. + For example, if the data sender uses 1500 byte segments, and the + receiver generates 40 byte acknowledgments (IPv4, TCP without + options), the reverse link will congest with ACKs for asymmetries of + more than 75:1 if delayed ACKs are used, and 37:1 if every segment is + acknowledged. For a 1.5 Mb/second data link, ACK congestion will + occur for reverse link speeds below 20 kilobits/sec. These levels of + asymmetry will readily occur if the reverse link is shared among + multiple satellite receivers, as is common in many VSAT satellite + networks. If a terrestrial modem link is used as a reverse link, ACK + congestion is also likely, especially as the speed of the forward + link is increased. 
Current congestion control mechanisms are aimed + at controlling the flow of data segments, but do not affect the flow + of ACKs. + + In [KVR98] the authors point out that the flow of acknowledgments can + be restricted on the low-speed link not only by the bandwidth of the + link, but also by the queue length of the router. The router may + limit its queue length by counting packets, not bytes, and therefore + begin discarding ACKs even if there is enough bandwidth to forward + them. + +3.9.1 Mitigation Description + + ACK Congestion Control extends the concept of flow control for data + segments to acknowledgment segments. In the method described in + [BPK97], any intermediate router can mark an acknowledgment with an + Explicit Congestion Notification (ECN) bit once the queue occupancy + in the router exceeds a given threshold. The data sender (which + receives the acknowledgment) must "echo" the ECN bit back to the data + receiver (see section 3.3.3 for a more detailed discussion of ECN). + The proposed algorithm for marking ACK segments with an ECN bit is + Random Early Detection (RED) [FJ93]. In response to the receipt of + ECN marked data segments, the receiver will dynamically reduce the + rate of acknowledgments using a multiplicative backoff. Once + segments without ECN are received, the data receiver speeds up + acknowledgments using a linear increase, up to a rate of either 1 (no + + + + +Allman, et al. Informational [Page 32] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + + delayed ACKs) or 2 (normal delayed ACKs) data segments per ACK. The + authors suggest that an ACK be generated at least once per window, + and ideally a few times per window. + + As in the RED congestion control mechanism for data flow, the + bottleneck gateway can randomly discard acknowledgments, rather than + marking them with an ECN bit, once the queue fills beyond a given + threshold. 
+ +3.9.2 Research + + [BPK97] analyze the effect of ACK Congestion Control (ACC) on the + performance of an asymmetric network. They note that the use of ACC, + and indeed the use of any scheme which reduces the frequency of + acknowledgments, has potential unwanted side effects. Since each ACK + will acknowledge more than the usual one or two data segments, the + likelihood of segment bursts from the data sender is increased. In + addition, congestion window growth may be impeded if the receiver + grows the window by counting received ACKs, as mandated by + [Ste97,APS99]. The authors therefore combine ACC with a series of + modifications to the data sender, referred to as TCP Sender + Adaptation (SA). SA combines a limit on the number of segments sent + in a burst, regardless of window size. In addition, byte counting + (as opposed to ACK counting) is employed for window growth. Note + that byte counting has been studied elsewhere and can introduce + side-effects, as well [All98]. + + The results presented in [BPK97] indicate that using ACC and SA will + reduce the bursts produced by ACK losses in unmodified (Reno) TCP. + In cases where these bursts would lead to data loss at an + intermediate router, the ACC and SA modification significantly + improve the throughput for a single data transfer. The results + further suggest that the use of ACC and SA significantly improve + fairness between two simultaneous transfers. + + ACC is further reported to prevent the increase in round trip time + (RTT) that occurs when an unmodified TCP fills the reverse router + queue with acknowledgments. + + In networks where the forward direction is expected to suffer losses + in one of the gateways, due to queue limitations, the authors report + at best a very slight improvement in performance for ACC and SA, + compared to unmodified Reno TCP. + + + + + + + + +Allman, et al. 
Informational [Page 33] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + +3.9.3 Implementation Issues + + Both ACC and SA require modification of the sending and receiving + hosts, as well as the bottleneck gateway. The current research + suggests that implementing ACC without the SA modifications results + in a data sender which generates potentially disruptive segment + bursts. It should be noted that ACC does require host modifications + if it is implemented in the way proposed in [BPK97]. The authors + note that ACC can be implemented by discarding ACKs (which requires + only a gateway modification, but no changes in the hosts), as opposed + to marking them with ECN. Such an implementation may, however, + produce bursty data senders if it is not combined with a burst + mitigation technique. ACC requires changes to the standard ACKing + behavior of a receiving TCP and therefore is not recommended for use + in shared networks. + +3.9.4 Topology Considerations + + Neither ACC nor SA require the storage of state in the gateway. + These schemes should therefore be applicable for all topologies, + provided that the hosts using the satellite or hybrid network can be + modified. However, these changes are expected to be especially + beneficial to networks containing asymmetric satellite links. + +3.9.5 Possible Interaction and Relationships with Other Research + + Note that ECN is a pre-condition for using ACK congestion control. + Additionally, the ACK Filtering algorithm discussed in the next + section attempts to solve the same problem as ACC. Choosing between + the two algorithms (or another mechanism) is currently an open + research question. + +3.10 ACK Filtering + + ACK Filtering (AF) is designed to address the same ACK congestion + effects described in 3.9. Contrary to ACC, however, AF is designed + to operate without host modifications. 
+ +3.10.1 Mitigation Description + + AF takes advantage of the cumulative acknowledgment structure of TCP. + The bottleneck router in the reverse direction (the low speed link) + must be modified to implement AF. Upon receipt of a segment which + represents a TCP acknowledgment, the router scans the queue for + redundant ACKs for the same connection, i.e. ACKs which acknowledge + portions of the window which are included in the most recent ACK. + All of these "earlier" ACKs are removed from the queue and discarded. + + + + +Allman, et al. Informational [Page 34] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + + The router does not store state information, but does need to + implement the additional processing required to find and remove + segments from the queue upon receipt of an ACK. + +3.10.2 Research + + [BPK97] analyzes the effects of AF. As is the case in ACC, the use + of ACK filtering alone would produce significant sender bursts, since + the ACKs will be acknowledging more previously-unacknowledged data. + The SA modifications described in 3.9.2 could be used to prevent + those bursts, at the cost of requiring host modifications. To + prevent the need for modifications in the TCP stack, AF is more + likely to be paired with the ACK Reconstruction (AR) technique, which + can be implemented at the router where segments exit the slow reverse + link. + + AR inspects ACKs exiting the link, and if it detects large "gaps" in + the ACK sequence, it generates additional ACKs to reconstruct an + acknowledgment flow which more closely resembles what the data sender + would have seen had ACK Filtering not been introduced. AR requires + two parameters; one parameter is the desired ACK frequency, while the + second controls the spacing, in time, between the release of + consecutive reconstructed ACKs. 
+ + In [BPK97], the authors show the combination of AF and AR to increase + throughput, in the networks studied, over both unmodified TCP and the + ACC/SA modifications. Their results also strongly suggest that the + use of AF alone, in networks where congestion losses are expected, + decreases performance (even below the level of unmodified TCP Reno) + due to sender bursting. + + AF delays acknowledgments from arriving at the receiver by dropping + earlier ACKs in favor of later ACKs. This process can cause a slight + hiccup in the transmission of new data by the TCP sender. + +3.10.3 Implementation Issues + + Both ACK Filtering and ACK Reconstruction require only router + modification. However, the implementation of AR requires some + storage of state information in the exit router. While AF does not + require storage of state information, its use without AR (or SA) + could produce undesired side effects. Furthermore, more research is + required regarding appropriate ranges for the parameters needed in + AR. + + + + + + + +Allman, et al. Informational [Page 35] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + +3.10.4 Topology Considerations + + AF and AR appear applicable to all topologies, assuming that the + storage of state information in AR does not prove to be prohibitive + for routers which handle large numbers of flows. The fact that TCP + stack modifications are not required for AF/AR makes this approach + attractive for hybrid networks and networks with diverse types of + hosts. These modifications, however, are expected to be most + beneficial in asymmetric network paths. + + On the other hand, the implementation of AF/AR requires the routers + to examine the TCP header, which prohibits their use in secure + networks where IPSEC is deployed. 
In such networks, AF/AR can be + effective only inside the security perimeter of a private, or virtual + private network, or in private networks where the satellite link is + protected only by link-layer encryption (as opposed to IPSEC). ACK + Filtering is safe to use in shared networks (from a congestion + control point-of-view), as the number of ACKs can only be reduced, + which makes TCP less aggressive. However, note that while TCP is + less aggressive, the delays that AF induces (outlined above) can lead + to larger bursts than would otherwise occur. + +3.10.5 Possible Interaction and Relationships with Other Research + + ACK Filtering attempts to solve the same problem as ACK Congestion + Control (as outlined in section 3.9). Which of the two algorithms is + more appropriate is currently an open research question. + +4 Conclusions + + This document outlines TCP items that may be able to mitigate the + performance problems associated with using TCP in networks containing + satellite links. These mitigations are not IETF standards track + mechanisms and require more study before being recommended by the + IETF. The research community is encouraged to examine the above + mitigations in an effort to determine which are safe for use in + shared networks such as the Internet. + +5 Security Considerations + + Several of the above sections noted specific security concerns which + a given mitigation aggravates. + + Additionally, any form of wireless communication link is more + susceptible to eavesdropping security attacks than standard wire- + based links due to the relative ease with which an attacker can watch + the network and the difficulty in finding attackers monitoring the + network. + + + +Allman, et al. Informational [Page 36] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + +6 Acknowledgments + + Our thanks to Aaron Falk and Sally Floyd, who provided very helpful + comments on drafts of this document.
+ +7 References + + [AFP98] Allman, M., Floyd, S. and C. Partridge, "Increasing TCP's + Initial Window", RFC 2414, September 1998. + + [AGS99] Allman, M., Glover, D. and L. Sanchez, "Enhancing TCP Over + Satellite Channels using Standard Mechanisms", BCP 28, RFC + 2488, January 1999. + + [AHKO97] Mark Allman, Chris Hayes, Hans Kruse, Shawn Ostermann. TCP + Performance Over Satellite Links. In Proceedings of the + 5th International Conference on Telecommunication Systems, + March 1997. + + [AHO98] Mark Allman, Chris Hayes, Shawn Ostermann. An Evaluation + of TCP with Larger Initial Windows. Computer Communication + Review, 28(3), July 1998. + + [AKO96] Mark Allman, Hans Kruse, Shawn Ostermann. An Application- + Level Solution to TCP's Satellite Inefficiencies. In + Proceedings of the First International Workshop on + Satellite-based Information Services (WOSBIS), November + 1996. + + [All97a] Mark Allman. Improving TCP Performance Over Satellite + Channels. Master's thesis, Ohio University, June 1997. + + [All97b] Mark Allman. Fixing Two BSD TCP Bugs. Technical Report + CR-204151, NASA Lewis Research Center, October 1997. + + [All98] Mark Allman. On the Generation and Use of TCP + Acknowledgments. ACM Computer Communication Review, 28(5), + October 1998. + + [AOK95] Mark Allman, Shawn Ostermann, Hans Kruse. Data Transfer + Efficiency Over Satellite Circuits Using a Multi-Socket + Extension to the File Transfer Protocol (FTP). In + Proceedings of the ACTS Results Conference, NASA Lewis + Research Center, September 1995. + + [AP99] Mark Allman, Vern Paxson. On Estimating End-to-End Network + Path Properties. ACM SIGCOMM, September 1999. + + + + +Allman, et al. Informational [Page 37] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + + [APS99] Allman, M., Paxson, V. and W. Richard Stevens, "TCP + Congestion Control", RFC 2581, April 1999. 
+ + [BCC+98] Braden, B., Clark, D., Crowcroft, J., Davie, B., Deering, + S., Estrin, D., Floyd, S., Jacobson, V., Minshall, G., + Partridge, C., Peterson, L., Ramakrishnan, K., Shenker, S., + Wroclawski, J. and L. Zhang, "Recommendations on Queue + Management and Congestion Avoidance in the Internet", RFC + 2309, April 1998. + + [BKVP97] B. Bakshi and P. Krishna and N. Vaidya and D. Pradham, + "Improving Performance of TCP over Wireless Networks", 17th + International Conference on Distributed Computing Systems + (ICDCS), May 1997. + + [BPK97] Hari Balakrishnan, Venkata N. Padmanabhan, and Randy H. + Katz. The Effects of Asymmetry on TCP Performance. In + Proceedings of the ACM/IEEE Mobicom, Budapest, Hungary, + ACM. September, 1997. + + [BPK98] Hari Balakrishnan, Venkata Padmanabhan, Randy H. Katz. The + Effects of Asymmetry on TCP Performance. ACM Mobile + Networks and Applications (MONET), 1998 (to appear). + + [BPSK96] H. Balakrishnan and V. Padmanabhan and S. Sechan and R. + Katz, "A Comparison of Mechanisms for Improving TCP + Performance over Wireless Links", ACM SIGCOMM, August 1996. + + [Bra89] Braden, R., "Requirements for Internet Hosts -- + Communication Layers", STD 3, RFC 1122, October 1989. + + [Bra92] Braden, R., "Transaction TCP -- Concepts", RFC 1379, + September 1992. + + [Bra94] Braden, R., "T/TCP -- TCP Extensions for Transactions: + Functional Specification", RFC 1644, July 1994. + + [BRS99] Hari Balakrishnan, Hariharan Rahul, and Srinivasan Seshan. + An Integrated Congestion Management Architecture for + Internet Hosts. ACM SIGCOMM, September 1999. + + [ddKI99] M. deVivo, G.O. deVivo, R. Koeneke, G. Isern. Internet + Vulnerabilities Related to TCP/IP and T/TCP. Computer + Communication Review, 29(1), January 1999. + + + + + + + +Allman, et al. Informational [Page 38] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + + [DENP97] Mikael Degermark, Mathias Engan, Bjorn Nordgren, Stephen + Pink. 
Low-Loss TCP/IP Header Compression for Wireless + Networks. ACM/Baltzer Journal on Wireless Networks, vol.3, + no.5, p. 375-87. + + [DMT96] R. C. Durst and G. J. Miller and E. J. Travis, "TCP + Extensions for Space Communications", Mobicom 96, ACM, USA, + 1996. + + [DNP99] Degermark, M., Nordgren, B. and S. Pink, "IP Header + Compression", RFC 2507, February 1999. + + [FF96] Kevin Fall, Sally Floyd. Simulation-based Comparisons of + Tahoe, Reno, and SACK TCP. Computer Communication Review, + V. 26 N. 3, July 1996, pp. 5-21. + + [FF99] Sally Floyd, Kevin Fall. Promoting the Use of End-to-End + Congestion Control in the Internet, IEEE/ACM Transactions + on Networking, August 1999. + + [FH99] Floyd, S. and T. Henderson, "The NewReno Modification to + TCP's Fast Recovery Algorithm", RFC 2582, April 1999. + + [FJ93] Sally Floyd and Van Jacobson. Random Early Detection + Gateways for Congestion Avoidance, IEEE/ACM Transactions on + Networking, V. 1 N. 4, August 1993. + + [Flo91] Sally Floyd. Connections with Multiple Congested Gateways + in Packet-Switched Networks, Part 1: One-way Traffic. ACM + Computer Communications Review, V. 21, N. 5, October 1991. + + [Flo94] Sally Floyd. TCP and Explicit Congestion Notification, ACM + Computer Communication Review, V. 24 N. 5, October 1994. + + [Flo99] Sally Floyd. "Re: TCP and out-of-order delivery", email to + end2end-interest mailing list, February, 1999. + + [Hah94] Jonathan Hahn. MFTP: Recent Enhancements and Performance + Measurements. Technical Report RND-94-006, NASA Ames + Research Center, June 1994. + + [Hay97] Chris Hayes. Analyzing the Performance of New TCP + Extensions Over Satellite Links. Master's Thesis, Ohio + University, August 1997. + + [HK98] Tom Henderson, Randy Katz. On Improving the Fairness of + TCP Congestion Avoidance. Proceedings of IEEE Globecom `98 + Conference, 1998. + + + +Allman, et al. 
Informational [Page 39] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + + [HK99] Tim Henderson, Randy Katz. Transport Protocols for + Internet-Compatible Satellite Networks, IEEE Journal on + Selected Areas of Communications, February, 1999. + + [Hoe95] J. Hoe, Startup Dynamics of TCP's Congestion Control and + Avoidance Schemes. Master's Thesis, MIT, 1995. + + [Hoe96] Janey Hoe. Improving the Startup Behavior of a Congestion + Control Scheme for TCP. In ACM SIGCOMM, August 1996. + + [IL92] David Iannucci and John Lakashman. MFTP: Virtual TCP + Window Scaling Using Multiple Connections. Technical + Report RND-92-002, NASA Ames Research Center, January 1992. + + [Jac88] Van Jacobson. Congestion Avoidance and Control. In + Proceedings of the SIGCOMM '88, ACM. August, 1988. + + [Jac90] Jacobson, V., "Compressing TCP/IP Headers", RFC 1144, + February 1990. + + [JBB92] Jacobson, V., Braden, R. and D. Borman, "TCP Extensions for + High Performance", RFC 1323, May 1992. + + [JK92] Van Jacobson and Mike Karels. Congestion Avoidance and + Control. Originally appearing in the proceedings of + SIGCOMM '88 by Jacobson only, this revised version includes + an additional appendix. The revised version is available + at ftp://ftp.ee.lbl.gov/papers/congavoid.ps.Z. 1992. + + [Joh95] Stacy Johnson. Increasing TCP Throughput by Using an + Extended Acknowledgment Interval. Master's Thesis, Ohio + University, June 1995. + + [KAGT98] Hans Kruse, Mark Allman, Jim Griner, Diepchi Tran. HTTP + Page Transfer Rates Over Geo-Stationary Satellite Links. + March 1998. Proceedings of the Sixth International + Conference on Telecommunication Systems. + + [Kes91] Srinivasan Keshav. A Control Theoretic Approach to Flow + Control. In ACM SIGCOMM, September 1991. + + [KM97] S. Keshav, S. Morgan. SMART Retransmission: Performance + with Overload and Random Losses. Proceeding of Infocom. + 1997. + + + + + + + +Allman, et al. 
Informational [Page 40] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + + [KVR98] Lampros Kalampoukas, Anujan Varma, and K. K.Ramakrishnan. + Improving TCP Throughput Over Two-Way Asymmetric Links: + Analysis and Solutions. Measurement and Modeling of + Computer Systems, 1998, Pages 78-89. + + [MM96a] M. Mathis, J. Mahdavi, "Forward Acknowledgment: Refining + TCP Congestion Control," Proceedings of SIGCOMM'96, August, + 1996, Stanford, CA. Available from + http://www.psc.edu/networking/papers/papers.html + + [MM96b] M. Mathis, J. Mahdavi, "TCP Rate-Halving with Bounding + Parameters" Available from + http://www.psc.edu/networking/papers/FACKnotes/current. + + [MMFR96] Mathis, M., Mahdavi, J., Floyd, S. and A. Romanow, "TCP + Selective Acknowledgment Options", RFC 2018, October 1996. + + [MSMO97] M. Mathis, J. Semke, J. Mahdavi, T. Ott, "The Macroscopic + Behavior of the TCP Congestion Avoidance + Algorithm",Computer Communication Review, volume 27, + number3, July 1997. Available from + http://www.psc.edu/networking/papers/papers.html + + [MV98] Miten N. Mehta and Nitin H. Vaidya. Delayed Duplicate- + Acknowledgments: A Proposal to Improve Performance of TCP + on Wireless Links. Technical Report 98-006, Department of + Computer Science, Texas A&M University, February 1998. + + [Nic97] Kathleen Nichols. Improving Network Simulation with + Feedback. Com21, Inc. Technical Report. Available from + http://www.com21.com/pages/papers/068.pdf. + + [PADHV99] Paxson, V., Allman, M., Dawson, S., Heavens, I. and B. + Volz, "Known TCP Implementation Problems", RFC 2525, March + 1999. + + [Pax97] Vern Paxson. Automated Packet Trace Analysis of TCP + Implementations. In Proceedings of ACM SIGCOMM, September + 1997. + + [PN98] Poduri, K. and K. Nichols, "Simulation Studies of Increased + Initial TCP Window Size", RFC 2415, September 1998. + + [Pos81] Postel, J., "Transmission Control Protocol", STD 7, RFC + 793, September 1981. + + + + + + +Allman, et al. 
Informational [Page 41] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + + [RF99] Ramakrishnan, K. and S. Floyd, "A Proposal to add Explicit + Congestion Notification (ECN) to IP", RFC 2481, January + 1999. + + [SF98] Nihal K. G. Samaraweera and Godred Fairhurst, + "Reinforcement of TCP error Recovery for Wireless + Communication", Computer Communication Review, volume 28, + number 2, April 1998. + + [SP98] Shepard, T. and C. Partridge, "When TCP Starts Up With Four + Packets Into Only Three Buffers", RFC 2416, September 1998. + + [Ste97] Stevens, W., "TCP Slow Start, Congestion Avoidance, Fast + Retransmit, and Fast Recovery Algorithms", RFC 2001, + January 1997. + + [Sut98] B. Suter, T. Lakshman, D. Stiliadis, and A. Choudhury. + Design Considerations for Supporting TCP with Per-flow + Queueing. Proceedings of IEEE Infocom `98 Conference, + 1998. + + [Tou97] Touch, J., "TCP Control Block Interdependence", RFC 2140, + April 1997. + + [VH97a] Vikram Visweswaraiah and John Heidemann. Improving Restart + of Idle TCP Connections. Technical Report 97-661, + University of Southern California, 1997. + + [VH97b] Vikram Visweswaraiah and John Heidemann. Rate-based pacing + Source Code Distribution, Web page: + http://www.isi.edu/lsam/publications/rate_based_pacing/README.html + November, 1997. + + [VH98] Vikram Visweswaraiah and John Heidemann. Improving Restart + of Idle TCP Connections (revised). Submitted for + publication. + + + + + + + + + + + + + + + +Allman, et al. Informational [Page 42] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + +8 Authors' Addresses + + Mark Allman + NASA Glenn Research Center/BBN Technologies + Lewis Field + 21000 Brookpark Rd. 
MS 54-2 + Cleveland, OH 44135 + + EMail: mallman@grc.nasa.gov + http://roland.grc.nasa.gov/~mallman + + + Spencer Dawkins + Nortel + P.O.Box 833805 + Richardson, TX 75083-3805 + + EMail: Spencer.Dawkins.sdawkins@nt.com + + + Dan Glover + NASA Glenn Research Center + Lewis Field + 21000 Brookpark Rd. MS 3-6 + Cleveland, OH 44135 + + EMail: Daniel.R.Glover@grc.nasa.gov + http://roland.grc.nasa.gov/~dglover + + + Jim Griner + NASA Glenn Research Center + Lewis Field + 21000 Brookpark Rd. MS 54-2 + Cleveland, OH 44135 + + EMail: jgriner@grc.nasa.gov + http://roland.grc.nasa.gov/~jgriner + + + Diepchi Tran + NASA Glenn Research Center + Lewis Field + 21000 Brookpark Rd. MS 54-2 + Cleveland, OH 44135 + + EMail: dtran@grc.nasa.gov + + + + +Allman, et al. Informational [Page 43] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + + Tom Henderson + University of California at Berkeley + Phone: +1 (510) 642-8919 + + EMail: tomh@cs.berkeley.edu + URL: http://www.cs.berkeley.edu/~tomh/ + + + John Heidemann + University of Southern California/Information Sciences Institute + 4676 Admiralty Way + Marina del Rey, CA 90292-6695 + + EMail: johnh@isi.edu + + + Joe Touch + University of Southern California/Information Sciences Institute + 4676 Admiralty Way + Marina del Rey, CA 90292-6601 + USA + + Phone: +1 310-448-9151 + Fax: +1 310-823-6714 + URL: http://www.isi.edu/touch + EMail: touch@isi.edu + + + Hans Kruse + J. Warren McClure School of Communication Systems Management + Ohio University + 9 S. College Street + Athens, OH 45701 + + Phone: 740-593-4891 + Fax: 740-593-4889 + EMail: hkruse1@ohiou.edu + http://www.csm.ohiou.edu/kruse + + + Shawn Ostermann + School of Electrical Engineering and Computer Science + Ohio University + 416 Morton Hall + Athens, OH 45701 + + Phone: (740) 593-1234 + EMail: ostermann@cs.ohiou.edu + + + +Allman, et al. 
Informational [Page 44] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + + Keith Scott + The MITRE Corporation + M/S W650 + 1820 Dolley Madison Blvd. + McLean VA 22102-3481 + + EMail: kscott@mitre.org + + + Jeffrey Semke + Pittsburgh Supercomputing Center + 4400 Fifth Ave. + Pittsburgh, PA 15213 + + EMail: semke@psc.edu + http://www.psc.edu/~semke + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Allman, et al. Informational [Page 45] + +RFC 2760 Ongoing TCP Research Related to Satellites February 2000 + + +9 Full Copyright Statement + + Copyright (C) The Internet Society (2000). All Rights Reserved. + + This document and translations of it may be copied and furnished to + others, and derivative works that comment on or otherwise explain it + or assist in its implementation may be prepared, copied, published + and distributed, in whole or in part, without restriction of any + kind, provided that the above copyright notice and this paragraph are + included on all such copies and derivative works. However, this + document itself may not be modified in any way, such as by removing + the copyright notice or references to the Internet Society or other + Internet organizations, except as needed for the purpose of + developing Internet standards in which case the procedures for + copyrights defined in the Internet Standards process must be + followed, or as required to translate it into languages other than + English. + + The limited permissions granted above are perpetual and will not be + revoked by the Internet Society or its successors or assigns. 
+ + This document and the information contained herein is provided on an + "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING + TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING + BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION + HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF + MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + +Acknowledgement + + Funding for the RFC Editor function is currently provided by the + Internet Society. + + + + + + + + + + + + + + + + + + + +Allman, et al. Informational [Page 46] + diff --git a/ext/picotcp/RFC/rfc3135.txt b/ext/picotcp/RFC/rfc3135.txt new file mode 100644 index 0000000..1138e09 --- /dev/null +++ b/ext/picotcp/RFC/rfc3135.txt @@ -0,0 +1,2523 @@ + + + + + + +Network Working Group J. Border +Request for Comments: 3135 Hughes Network Systems +Category: Informational M. Kojo + University of Helsinki + J. Griner + NASA Glenn Research Center + G. Montenegro + Sun Microsystems, Inc. + Z. Shelby + University of Oulu + June 2001 + + + Performance Enhancing Proxies Intended to Mitigate Link-Related + Degradations + +Status of this Memo + + This memo provides information for the Internet community. It does + not specify an Internet standard of any kind. Distribution of this + memo is unlimited. + +Copyright Notice + + Copyright (C) The Internet Society (2001). All Rights Reserved. + +Abstract + + This document is a survey of Performance Enhancing Proxies (PEPs) + often employed to improve degraded TCP performance caused by + characteristics of specific link environments, for example, in + satellite, wireless WAN, and wireless LAN environments. Different + types of Performance Enhancing Proxies are described as well as the + mechanisms used to improve performance. Emphasis is put on proxies + operating with TCP. 
In addition, motivations for their development + and use are described along with some of the consequences of using + them, especially in the context of the Internet. + +Table of Contents + + 1. Introduction . . . . . . . . . . . . . . . . . . . . . . . . . 3 + 2. Types of Performance Enhancing Proxies . . . . . . . . . . . . 4 + 2.1 Layering . . . . . . . . . . . . . . . . . . . . . . . . . . . 4 + 2.1.1 Transport Layer PEPs . . . . . . . . . . . . . . . . . . . . 5 + 2.1.2 Application Layer PEPs . . . . . . . . . . . . . . . . . . . 5 + 2.2 Distribution . . . . . . . . . . . . . . . . . . . . . . . . . 6 + 2.3 Implementation Symmetry . . . . . . . . . . . . . . . . . . . 6 + 2.4 Split Connections . . . . . . . . . . . . . . . . . . . . . . 7 + + + +Border, et al. Informational [Page 1] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + + 2.5 Transparency . . . . . . . . . . . . . . . . . . . . . . . . . 8 + 3. PEP Mechanisms . . . . . . . . . . . . . . . . . . . . . . . . 9 + 3.1 TCP ACK Handling . . . . . . . . . . . . . . . . . . . . . . . 9 + 3.1.1 TCP ACK Spacing . . . . . . . . . . . . . . . . . . . . . . 9 + 3.1.2 Local TCP Acknowledgements . . . . . . . . . . . . . . . . . 9 + 3.1.3 Local TCP Retransmissions . . . . . . . . . . . . . . . . . 9 + 3.1.4 TCP ACK Filtering and Reconstruction . . . . . . . . . . . . 10 + 3.2 Tunneling . . . . . . . . . . . . . . . . . . . . . . . . . . 10 + 3.3 Compression . . . . . . . . . . . . . . . . . . . . . . . . . 10 + 3.4 Handling Periods of Link Disconnection with TCP . . . . . . . 11 + 3.5 Priority-based Multiplexing . . . . . . . . . . . . . . . . . 12 + 3.6 Protocol Booster Mechanisms . . . . . . . . . . . . . . . . . 13 + 4. Implications of Using PEPs . . . . . . . . . . . . . . . . . . 14 + 4.1 The End-to-end Argument . . . . . . . . . . . . . . . . . . . 14 + 4.1.1 Security . . . . . . . . . . . . . . . . . . . . . . . . . . 14 + 4.1.1.1 Security Implications . . . . . . . . . . . . . . . . . . 
15 + 4.1.1.2 Security Implication Mitigations . . . . . . . . . . . . . 16 + 4.1.1.3 Security Research Related to PEPs . . . . . . . . . . . . 16 + 4.1.2 Fate Sharing . . . . . . . . . . . . . . . . . . . . . . . . 16 + 4.1.3 End-to-end Reliability . . . . . . . . . . . . . . . . . . . 17 + 4.1.4 End-to-end Failure Diagnostics . . . . . . . . . . . . . . . 19 + 4.2 Asymmetric Routing . . . . . . . . . . . . . . . . . . . . . . 19 + 4.3 Mobile Hosts . . . . . . . . . . . . . . . . . . . . . . . . . 20 + 4.4 Scalability . . . . . . . . . . . . . . . . . . . . . . . . . 20 + 4.5 Other Implications of Using PEPs . . . . . . . . . . . . . . . 21 + 5. PEP Environment Examples . . . . . . . . . . . . . . . . . . . 21 + 5.1 VSAT Environments . . . . . . . . . . . . . . . . . . . . . . 21 + 5.1.1 VSAT Network Characteristics . . . . . . . . . . . . . . . . 22 + 5.1.2 VSAT Network PEP Implementations . . . . . . . . . . . . . . 23 + 5.1.3 VSAT Network PEP Motivation . . . . . . . . . . . . . . . . 24 + 5.2 W-WAN Environments . . . . . . . . . . . . . . . . . . . . . . 25 + 5.2.1 W-WAN Network Characteristics . . . . . . . . . . . . . . . 25 + 5.2.2 W-WAN PEP Implementations . . . . . . . . . . . . . . . . . 26 + 5.2.2.1 Mowgli System . . . . . . . . . . . . . . . . . . . . . . 26 + 5.2.2.2 Wireless Application Protocol (WAP) . . . . . . . . . . . 28 + 5.2.3 W-WAN PEP Motivation . . . . . . . . . . . . . . . . . . . . 29 + 5.3 W-LAN Environments . . . . . . . . . . . . . . . . . . . . . . 30 + 5.3.1 W-LAN Network Characteristics . . . . . . . . . . . . . . . 30 + 5.3.2 W-LAN PEP Implementations: Snoop . . . . . . . . . . . . . . 31 + 5.3.3 W-LAN PEP Motivation . . . . . . . . . . . . . . . . . . . . 33 + 6. Security Considerations . . . . . . . . . . . . . . . . . . . . 34 + 7. IANA Considerations . . . . . . . . . . . . . . . . . . . . . . 34 + 8. Acknowledgements . . . . . . . . . . . . . . . . . . . . . . . 34 + 9. References . . . . . . . . . . . . . . . . . . . . . . . . 
. . 35 + 10. Authors' Addresses . . . . . . . . . . . . . . . . . . . . . . 39 + Appendix A - PEP Terminology Summary . . . . . . . . . . . . . . . 41 + Full Copyright Statement . . . . . . . . . . . . . . . . . . . . . 45 + + + + +Border, et al. Informational [Page 2] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + +1. Introduction + + The Transmission Control Protocol [RFC0793] (TCP) is used as the + transport layer protocol by many Internet and intranet applications. + However, in certain environments, TCP and other higher layer protocol + performance is limited by the link characteristics of the + environment. + + This document is a survey of Performance Enhancing Proxy (PEP) + performance mitigation techniques. A PEP is used to improve the + performance of the Internet protocols on network paths where native + performance suffers due to characteristics of a link or subnetwork on + the path. This document is informational and does not make + recommendations about using PEPs or not using them. Distinct + standards track recommendations for the performance mitigation of TCP + over links with high error rates, links with low bandwidth, and so + on, have been developed or are in development by the Performance + Implications of Link Characteristics WG (PILC) [PILCWEB]. + + Link design choices may have a significant influence on the + performance and efficiency of the Internet. However, not all link + characteristics, for example, high latency, can be compensated for by + choices in the link layer design. And, the cost of compensating for + some link characteristics may be prohibitive for some technologies. + The techniques surveyed here are applied to existing link + technologies. When new link technologies are designed, they should + be designed so that these techniques are not required, if at all + possible. + + This document does not advocate the use of PEPs in any general case.
+ On the contrary, we believe that the end-to-end principle in + designing Internet protocols should be retained as the prevailing + approach and PEPs should be used only in specific environments and + circumstances where end-to-end mechanisms providing similar + performance enhancements are not available. In any environment where + one might consider employing a PEP for improved performance, an end + user (or, in some cases, the responsible network administrator) + should be aware of the PEP and the choice of employing PEP + functionality should be under the control of the end user, especially + if employing the PEP would interfere with end-to-end usage of IP + layer security mechanisms or otherwise have undesirable implications + in some circumstances. This would allow the user to choose end-to- + end IP at all times but, of course, without the performance + enhancements that employing the PEP may yield. + + This survey does not make recommendations, for or against, with + respect to using PEPs. Standards track recommendations have been or + are being developed within the IETF for individual link + + + +Border, et al. Informational [Page 3] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + + characteristics, e.g., links with high error rates, links with low + bandwidth, links with asymmetric bandwidth, etc., by the Performance + Implications of Link Characteristics WG (PILC) [PILCWEB]. + + The remainder of this document is organized as follows. Section 2 + provides an overview of different kinds of PEP implementations. + + Section 3 discusses some of the mechanisms which PEPs may employ in + order to improve performance. Section 4 discusses some of the + implications with respect to using PEPs, especially in the context of + the global Internet. 
Finally, Section 5 discusses some example + environments where PEPs are used: satellite very small aperture + terminal (VSAT) environments, mobile wireless WAN (W-WAN) + environments and wireless LAN (W-LAN) environments. A summary of PEP + terminology is included in an appendix (Appendix A). + +2. Types of Performance Enhancing Proxies + + There are many types of Performance Enhancing Proxies. Different + types of PEPs are used in different environments to overcome + different link characteristics which affect protocol performance. + Note that enhancing performance is not necessarily limited in scope + to throughput. Other performance related aspects, like usability of + a link, may also be addressed. For example, [M-TCP] addresses the + issue of keeping TCP connections alive during periods of + disconnection in wireless networks. + + The following sections describe some of the key characteristics which + differentiate different types of PEPs. + +2.1 Layering + + In principle, a PEP implementation may function at any protocol layer + but typically it functions at one or two layers only. In this + document we focus on PEP implementations that function at the + transport layer or at the application layer as such PEPs are most + commonly used to enhance performance over links with problematic + characteristics. A PEP implementation may also operate below the + network layer, that is, at the link layer, but this document pays + only little attention to such PEPs as link layer mechanisms can be + and typically are implemented transparently to network and higher + layers, requiring no modifications to protocol operation above the + link layer. It should also be noted that some PEP implementations + operate across several protocol layers by exploiting the protocol + information and possibly modifying the protocol operation at more + than one layer. For such a PEP it may be difficult to define at + which layer(s) it exactly operates on. + + + + +Border, et al. 
Informational [Page 4] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + +2.1.1 Transport Layer PEPs + + Transport layer PEPs operate at the transport level. They may be + aware of the type of application being carried by the transport layer + but, at most, only use this information to influence their behavior + with respect to the transport protocol; they do not modify the + application protocol in any way, but let the application protocol + operate end-to-end. Most transport layer PEP implementations + interact with TCP. Such an implementation is called a TCP + Performance Enhancing Proxy (TCP PEP). For example, in an + environment where ACKs may bunch together causing undesirable data + segment bursts, a TCP PEP may be used to simply modify the ACK + spacing in order to improve performance. On the other hand, in an + environment with a large bandwidth*delay product, a TCP PEP may be + used to alter the behavior of the TCP connection by generating local + acknowledgments to TCP data segments in order to improve the + connection's throughput. + + The term TCP spoofing is sometimes used synonymously for TCP PEP + functionality. However, the term TCP spoofing more accurately + describes the characteristic of intercepting a TCP connection in the + middle and terminating the connection as if the interceptor is the + intended destination. While this is a characteristic of many TCP PEP + implementations, it is not a characteristic of all TCP PEP + implementations. + +2.1.2 Application Layer PEPs + + Application layer PEPs operate above the transport layer. Today, + different kinds of application layer proxies are widely used in the + Internet. 
Such proxies include Web caches and relay Mail Transfer + Agents (MTA) and they typically try to improve performance or service + availability and reliability in general and in a way which is + applicable in any environment but they do not necessarily include any + optimizations that are specific to certain link characteristics. + + Application layer PEPs, on the other hand, can be implemented to + improve application protocol as well as transport layer performance + with respect to a particular application being used with a particular + type of link. An application layer PEP may have the same + functionality as the corresponding regular proxy for the same + application (e.g., relay MTA or Web caching proxy) but extended with + link-specific optimizations of the application protocol operation. + + Some application protocols employ extraneous round trips, overly + verbose headers and/or inefficient header encoding which may have a + significant impact on performance, in particular, with long delay and + slow links. This unnecessary overhead can be reduced, in general or + + + +Border, et al. Informational [Page 5] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + + for a particular type of link, by using an application layer PEP in + an intermediate node. Some examples of application layer PEPs which + have been shown to improve performance on slow wireless WAN links are + described in [LHKR96] and [CTC+97]. + +2.2 Distribution + + A PEP implementation may be integrated, i.e., it comprises a single + PEP component implemented within a single node, or distributed, i.e., + it comprises two or more PEP components, typically implemented in + multiple nodes. An integrated PEP implementation represents a single + point at which performance enhancement is applied. For example, a + single PEP component might be implemented to provide impedance + matching at the point where wired and wireless links meet. 
+ + A distributed PEP implementation is generally used to surround a + particular link for which performance enhancement is desired. For + example, a PEP implementation for a satellite connection may be + distributed between two PEPs located at each end of the satellite + link. + +2.3 Implementation Symmetry + + A PEP implementation may be symmetric or asymmetric. Symmetric PEPs + use identical behavior in both directions, i.e., the actions taken by + the PEP occur independent from which interface a packet is received. + Asymmetric PEPs operate differently in each direction. The direction + can be defined in terms of the link (e.g., from a central site to a + remote site) or in terms of protocol traffic (e.g., the direction of + TCP data flow, often called the TCP data channel, or the direction of + TCP ACK flow, often called the TCP ACK channel). An asymmetric PEP + implementation is generally used at a point where the characteristics + of the links on each side of the PEP differ or with asymmetric + protocol traffic. For example, an asymmetric PEP might be placed at + the intersection of wired and wireless networks or an asymmetric + application layer PEP might be used for the request-reply type of + HTTP traffic. A PEP implementation may also be both symmetric and + asymmetric at the same time with regard to different mechanisms it + employs. (PEP mechanisms are described in Section 3.) + + Whether a PEP implementation is symmetric or asymmetric is + independent of whether the PEP implementation is integrated or + distributed. In other words, a distributed PEP implementation might + operate symmetrically at each end of a link (i.e., the two PEPs + function identically). On the other hand, a distributed PEP + implementation might operate asymmetrically, with a different PEP + implementation at each end of the link. Again, this usually is used + with asymmetric links. For example, for a link with an asymmetric + + + +Border, et al. 
Informational [Page 6] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + + amount of bandwidth available in each direction, the PEP on the end + of the link forwarding traffic in the direction with a large amount + of bandwidth might focus on locally acknowledging TCP traffic in + order to use the available bandwidth. At the same time, the PEP on + the end of the link forwarding traffic in the direction with very + little bandwidth might focus on reducing the amount of TCP + acknowledgement traffic being forwarded across the link (to keep the + link from congesting). + +2.4 Split Connections + + A split connection TCP implementation terminates the TCP connection + received from an end system and establishes a corresponding TCP + connection to the other end system. In a distributed PEP + implementation, this is typically done to allow the use of a third + connection between two PEPs optimized for the link. This might be a + TCP connection optimized for the link or it might be another + protocol, for example, a proprietary protocol running on top of UDP. + Also, the distributed implementation might use a separate connection + between the proxies for each TCP connection or it might multiplex the + data from multiple TCP connections across a single connection between + the PEPs. + + In an integrated PEP split connection TCP implementation, the PEP + again terminates the connection from one end system and originates a + separate connection to the other end system. [I-TCP] documents an + example of a single PEP split connection implementation. + + Many integrated PEPs use a split connection implementation in order + to address a mismatch in TCP capabilities between two end systems. + For example, the TCP window scaling option [RFC1323] can be used to + extend the maximum amount of TCP data which can be "in flight" (i.e., + sent and awaiting acknowledgement). This is useful for filling a + link which has a high bandwidth*delay product. 
If one end system is + capable of using scaled TCP windows but the other is not, the end + system which is not capable can set up its connection with a PEP on + its side of the high bandwidth*delay link. The split connection PEP + then sets up a TCP connection with window scaling over the link to + the other end system. + + Split connection TCP implementations can effectively leverage TCP + performance enhancements optimal for a particular link but which + cannot necessarily be employed safely over the global Internet. + + Note that using split connection PEPs does not necessarily exclude + simultaneous use of IP for end-to-end connectivity. If a split + connection is managed per application or per connection and is under + the control of the end user, the user can decide whether a particular + + + +Border, et al. Informational [Page 7] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + + TCP connection or application makes use of the split connection PEP + or whether it operates end-to-end. When a PEP is employed on a last + hop link, the end user control is relatively easy to implement. + + In effect, application layer proxies for TCP-based applications are + split connection TCP implementations with end systems using PEPs as a + service related to a particular application. Therefore, all + transport (TCP) layer enhancements that are available with split + connection TCP implementations can also be employed with application + layer PEPs in conjunction with application layer enhancements. + +2.5 Transparency + + Another key characteristic of a PEP is its degree of transparency. + PEPs may operate totally transparently to the end systems, transport + endpoints, and/or applications involved (in a connection), requiring + no modifications to the end systems, transport endpoints, or + applications. + + On the other hand, a PEP implementation may require modifications to + both ends in order to be used. 
In between, a PEP implementation may + require modifications to only one of the ends involved. Either of + these kinds of PEP implementations is non-transparent, at least to the + layer requiring modification. + + It is sometimes useful to think of the degree of transparency of a + PEP implementation at four levels: transparency with respect to the + end systems (network-layer transparent PEP), transparency with + respect to the transport endpoints (transport-layer transparent PEP), + transparency with respect to the applications (application-layer + transparent PEP) and transparency with respect to the users. For + example, a user who subscribes to a satellite Internet access service + may be aware that the satellite terminal is providing a performance + enhancing service even though the TCP/IP stack and the applications + in the user's PC are not aware of the PEP which implements it. + + Note that the issue of transparency is not the same as the issue of + maintaining end-to-end semantics. For example, a PEP implementation + which simply uses a TCP ACK spacing mechanism maintains the end-to- + end semantics of the TCP connection while a split connection TCP PEP + implementation may not. Yet, both can be implemented transparently + to the transport endpoints at both ends. The implications of not + maintaining the end-to-end semantics, in particular the end-to-end + semantics of TCP connections, are discussed in Section 4. + + + + + + + +Border, et al. Informational [Page 8] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + +3. PEP Mechanisms + + An obvious key characteristic of a PEP implementation is the + mechanism(s) it uses to improve performance. Some examples of PEP + mechanisms are described in the following subsections. A PEP + implementation might implement more than one of these mechanisms. + +3.1 TCP ACK Handling + + Many TCP PEP implementations are based on TCP ACK manipulation. 
The + handling of TCP acknowledgments can differ significantly between + different TCP PEP implementations. The following subsections + describe various TCP ACK handling mechanisms. Many implementations + combine some of these mechanisms and possibly employ some additional + mechanisms as well. + +3.1.1 TCP ACK Spacing + + In environments where ACKs tend to bunch together, ACK spacing is + used to smooth out the flow of TCP acknowledgments traversing a link. + This improves performance by eliminating bursts of TCP data segments + that the TCP sender would send due to back-to-back arriving TCP + acknowledgments [BPK97]. + +3.1.2 Local TCP Acknowledgements + + In some PEP implementations, TCP data segments received by the PEP + are locally acknowledged by the PEP. This is very useful over + network paths with a large bandwidth*delay product as it speeds up + TCP slow start and allows the sending TCP to quickly open up its + congestion window. Local (negative) acknowledgments are often also + employed to trigger local (and faster) error recovery on links with + significant error rates. (See Section 3.1.3.) + + Local acknowledgments are automatically employed with split + connection TCP implementations. When local acknowledgments are used, + the burden falls upon the TCP PEP to recover any data which is + dropped after the PEP acknowledges it. + +3.1.3 Local TCP Retransmissions + + A TCP PEP may locally retransmit data segments lost on the path + between the TCP PEP and the receiving end system, thus aiming at + faster recovery from lost data. In order to achieve this the TCP PEP + may use acknowledgments arriving from the end system that receives + the TCP data segments, along with appropriate timeouts, to determine + + + + + +Border, et al. Informational [Page 9] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + + when to locally retransmit lost data. 
TCP PEPs sending local + acknowledgments to the sending end system are required to employ + local retransmissions towards the receiving end system. + + Some PEP implementations perform local retransmissions even though + they do not use local acknowledgments to alter TCP connection + performance. Basic Snoop [SNOOP] is a well-known example of such a + PEP implementation. Snoop caches TCP data segments it receives and + forwards and then monitors the end-to-end acknowledgments coming from + the receiving TCP end system for duplicate acknowledgments (DUPACKs). + When DUPACKs are received, Snoop locally retransmits the lost TCP + data segments from its cache, suppressing the DUPACKs flowing to the + sending TCP end system until acknowledgments for new data are + received. The Snoop system also implements an option to employ local + negative acknowledgments to trigger local TCP retransmissions. This + can be achieved, for example, by applying TCP selective + acknowledgments locally on the error-prone link. (See Section 5.3 + for details.) + +3.1.4 TCP ACK Filtering and Reconstruction + + On paths with highly asymmetric bandwidth the TCP ACKs flowing in the + low-speed direction may get congested if the asymmetry ratio is high + enough. The ACK filtering and reconstruction mechanism addresses + this by filtering the ACKs on one side of the link and reconstructing + the deleted ACKs on the other side of the link. The mechanism and + the issue of dealing with TCP ACK congestion with highly asymmetric + links are discussed in detail in [RFC2760] and in [BPK97]. + +3.2 Tunneling + + A Performance Enhancing Proxy may encapsulate messages to carry the + messages across a particular link or to force messages to traverse a + particular path. A PEP at the other end of the encapsulation tunnel + removes the tunnel wrappers before final delivery to the receiving + end system. 
A tunnel might be used by a distributed split connection + TCP implementation as the means for carrying the connection between + the distributed PEPs. A tunnel might also be used to support forcing + TCP connections which use asymmetric routing to go through the end + points of a distributed PEP implementation. + +3.3 Compression + + Many PEP implementations include support for one or more forms of + compression. In some PEP implementations, compression may even be + the only mechanism used for performance improvement. Compression + reduces the number of bytes which need to be sent across a link. + This is useful in general and can be very important for bandwidth + + + +Border, et al. Informational [Page 10] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + + limited links. Benefits of using compression include improved link + efficiency and higher effective link utilization, reduced latency and + improved interactive response time, decreased overhead and reduced + packet loss rate over lossy links. + + Where appropriate, link layer compression is used. TCP and IP header + compression are also frequently used with PEP implementations. + [RFC1144] describes a widely deployed method for compressing TCP + headers. Other header compression algorithms are described in + [RFC2507], [RFC2508] and [RFC2509]. + + Payload compression is also desirable and is increasing in importance + with today's increased emphasis on Internet security. Network (IP) + layer (and above) security mechanisms convert IP payloads into random + bit streams which defeat applicable link layer compression mechanisms + by removing or hiding redundant "information." Therefore, + compression of the payload needs to be applied before security + mechanisms are applied. [RFC2393] defines a framework where common + compression algorithms can be applied to arbitrary IP segment + payloads. However, [RFC2393] compression is not always applicable. 
+ Many types of IP payloads (e.g., images, audio, video and "zipped" + files being transferred) are already compressed. And, when security + mechanisms such as TLS [RFC2246] are applied above the network (IP) + layer, the data is already encrypted (and possibly also compressed), + again removing or hiding any redundancy in the payload. The + resulting additional transport or network layer compression will + compact only headers, which are small, and possibly already covered + by separate compression algorithms of their own. + + With application layer PEPs one can employ application-specific + compression. Typically an application-specific (or content-specific) + compression mechanism is much more efficient than any generic + compression mechanism. For example, a distributed Web PEP + implementation may implement more efficient binary encoding of HTTP + headers, or a PEP can employ lossy compression that reduces the image + quality of online-images on Web pages according to end user + instructions, thus reducing the number of bytes transferred over a + slow link and consequently the response time perceived by the user + [LHKR96]. + +3.4 Handling Periods of Link Disconnection with TCP + + Periods of link disconnection or link outages are very common with + some wireless links. During these periods, a TCP sender does not + receive the expected acknowledgments. Upon expiration of the + retransmit timer, this causes TCP to close its congestion window with + all of the related drawbacks. A TCP PEP may monitor the traffic + coming from the TCP sender towards the TCP receiver behind the + + + +Border, et al. Informational [Page 11] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + + disconnected link. The TCP PEP retains the last ACK, so that it can + shut down the TCP sender's window by sending the last ACK with a + window set to zero. Thus, the TCP sender will go into persist mode. 
+ + To make this work in both directions with an integrated TCP PEP + implementation, the TCP receiver behind the disconnected link must be + aware of the current state of the connection and, in the event of a + disconnection, it must be capable of freezing all timers. [M-TCP] + implements such operation. Another possibility is that the + disconnected link is surrounded by a distributed PEP pair. + + In split connection TCP implementations, a period of link + disconnection can easily be hidden from the end host on the other + side of the PEP thus precluding the TCP connection from breaking even + if the period of link disconnection lasts a very long time; if the + TCP PEP cannot forward data due to link disconnection, it stops + receiving data. Normal TCP flow control then prevents the TCP sender + from sending more than the TCP advertised window allowed by the PEP. + Consequently, the PEP and its counterpart behind the disconnected + link can employ a modified TCP version which retains the state and + all unacknowledged data segments across the period of disconnection + and then performs local recovery as the link is reconnected. The + period of link disconnection may or may not be hidden from the + application and user, depending upon what application the user is + using the TCP connection for. + +3.5 Priority-based Multiplexing + + Implementing priority-based multiplexing of data over a slow and + expensive link may significantly improve the performance and + usability of the link for selected applications or connections. + + A user behind a slow link would experience the link more feasible to + use in case of simultaneous data transfers, if urgent data transfers + (e.g., interactive connections) could have shorter response time + (better performance) than less urgent background transfers. 
If the + interactive connections transmit enough data to keep the slow link + fully utilized, it might be necessary to fully suspend the background + transfers for a while to ensure timely delivery for the interactive + connections. + + In flight TCP segments of an end-to-end TCP connection (with low + priority) cannot be delayed for a long time. Otherwise, the TCP + timer at the sending end would expire, resulting in suboptimal + performance. However, this kind of operation can be controlled in + conjunction with a split connection TCP PEP by assigning different + priorities for different connections (or applications). A split + connection PEP implementation allows the PEP in an intermediate node + + + +Border, et al. Informational [Page 12] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + + to delay the data delivery of a lower-priority TCP flow for an + unlimited period of time by simply rescheduling the order in which it + forwards data of different flows to the destination host behind the + slow link. This does not have a negative impact on the delayed TCP + flow as normal TCP flow control takes care of suspending the flow + between the TCP sender and the PEP, when the PEP is not forwarding + data for the flow, and resumes it once the PEP decides to continue + forwarding data for the flow. This can further be assisted, if the + protocol stacks on both sides of the slow link implement priority + based scheduling of connections. + + With such a PEP implementation, along with user-controlled + priorities, the user can assign higher priority for selected + interactive connection(s) and have much shorter response time for the + selected connection(s), even if there are simultaneous low priority + bulk data transfers which in regular end-to-end operation would + otherwise eat the available bandwidth of the slow link almost + completely. 
These low priority bulk data transfers would then + proceed nicely during the idle periods of interactive connections, + allowing the user to keep the slow and expensive link (e.g., wireless + WAN) fully utilized. + + Other priority-based mechanisms may be applied on shared wireless + links with more than two terminals. With shared wireless mediums + becoming a weak link in Internet QoS architectures, many may turn to + PEPs to provide extra priority levels across a shared wireless medium + [SHEL00]. These PEPs are distributed on all nodes of the shared + wireless medium. For example, in an 802.11 WLAN this PEP is + implemented in the access point (base station) and each mobile host. + One PEP then uses distributed queuing techniques to coordinate + traffic classes of all nodes. This is also sometimes called subnet + bandwidth management. See [BBKT97] for an example of queuing + techniques which can be used to achieve this. This technique can be + implemented either above or below the IP layer. Priority treatment + can typically be specified either by the user or by marking the + (IPv4) ToS or (IPv6) Traffic Class IP header field. + +3.6 Protocol Booster Mechanisms + + Work in [FMSBMR98] shows a range of other possible PEP mechanisms + called protocol boosters. Some of these mechanisms are specific to + UDP flows. For example, a PEP may apply asymmetrical methods such as + extra UDP error detection. Since the 16 bit UDP checksum is + optional, it is typically not computed. However, for links with + errors, the checksum could be beneficial. This checksum can be added + to outgoing UDP packets by a PEP. + + + + + +Border, et al. Informational [Page 13] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + + Symmetrical mechanisms have also been developed. A Forward Erasure + Correction (FZC) mechanism can be used with real-time and multicast + traffic. The encoding PEP adds a parity packet over a block of + packets. 
Upon reception, the parity is removed and missing data is + regenerated. A jitter control mechanism can be implemented at the + expense of extra latency. A sending PEP can add a timestamp to + outgoing packets. The receiving PEP then delays packets in order to + reproduce the correct interval. + +4. Implications of Using PEPs + + The following sections describe some of the implications of using + Performance Enhancing Proxies. + +4.1 The End-to-end Argument + + As indicated in [RFC1958], the end-to-end argument [SRC84] is one of + the architectural principles of the Internet. The basic argument is + that, as a first principle, certain required end-to-end functions can + only be correctly performed by the end systems themselves. Most of + the potential negative implications associated with using PEPs are + related to the possibility of breaking the end-to-end semantics of + connections. This is one of the main reasons why PEPs are not + recommended for general use. + + As indicated in Section 2.5, not all PEP implementations break the + end-to-end semantics of connections. Correctly designed PEPs do not + attempt to replace any application level end-to-end function, but + only attempt to add performance optimizations to a subpath of the + end-to-end path between the application endpoints. Doing this can be + consistent with the end-to-end argument. However, a user or network + administrator adding a PEP to his network configuration should be + aware of the potential end-to-end implications related to the + mechanisms being used by the particular PEP implementation. + +4.1.1 Security + + In most cases, security applied above the transport layer can be used + with PEPs, especially transport layer PEPs. However, today, only a + limited number of applications include support for the use of + transport (or higher) layer security. 
Network (IP) layer security + (IPsec) [RFC2401], on the other hand, can generally be used by any + application, transparently to the application. + + + + + + + + +Border, et al. Informational [Page 14] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + +4.1.1.1 Security Implications + + The most detrimental negative implication of breaking the end-to-end + semantics of a connection is that it disables end-to-end use of + IPsec. In general, a user or network administrator must choose + between using PEPs and using IPsec. If IPsec is employed end-to-end, + PEPs that are implemented on intermediate nodes in the network cannot + examine the transport or application headers of IP packets because + encryption of IP packets via IPsec's ESP header (in either transport + or tunnel mode) renders the TCP header and payload unintelligible to + the PEPs. Without being able to examine the transport or application + headers, a PEP may not function optimally or at all. + + If a PEP implementation is non-transparent to the users and the users + trust the PEP in the middle, IPsec can be used separately between + each end system and PEP. However, in most cases this is an + undesirable or unacceptable alternative as the end systems cannot + trust PEPs in general. In addition, this is not as secure as end- + to-end security. (For example, the traffic is exposed in the PEP + when it is decrypted to be processed.) And, it can lead to + potentially misleading security level assumptions by the end systems. + If the two end systems negotiate different levels of security with + the PEP, the end system which negotiated the stronger level of + security may not be aware that a lower level of security is being + provided for part of the connection. 
The PEP could be implemented to + prevent this from happening by being smart enough to force the same + level of security to each end system but this increases the + complexity of the PEP implementation (and still is not as secure as + end-to-end security). + + With a transparent PEP implementation, it is difficult for the end + systems to trust the PEP because they may not be aware of its + existence. Even if the user is aware of the PEP, setting up + acceptable security associations with the PEP while maintaining the + PEP's transparent nature is problematic (if not impossible). + + Note that even when a PEP implementation does not break the end-to- + end semantics of a connection, the PEP implementation may not be able + to function in the presence of IPsec. For example, it is difficult + to do ACK spacing if the PEP cannot reliably determine which IP + packets contain ACKs of interest. In any case, the authors are + currently not aware of any PEP implementations, transparent or non- + transparent, which provide support for end-to-end IPsec, except in a + case where the PEPs are implemented on the end hosts. + + + + + + + +Border, et al. Informational [Page 15] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + +4.1.1.2 Security Implication Mitigations + + There are some steps which can be taken to allow the use of IPsec and + PEPs to coexist. If an end user can select the use of IPsec for some + traffic and not for other traffic, PEP processing can be applied to + the traffic sent without IPsec. Of course, the user must then do + without security for this traffic or provide security for the traffic + via other means (for example, by using transport layer security). + However, even when this is possible, significant complexity may need + to be added to the configuration of the end system. + + Another alternative is to implement IPsec between the two PEPs of a + distributed PEP implementation. This at least protects the traffic + between the two PEPs. 
(The issue of trusting the PEPs does not + change.) In the case where the PEP implementation is not transparent + to the user, (assuming that the user trusts the PEPs,) the user can + configure his end system to use the PEPs as the end points of an + IPsec tunnel. And, an IPsec tunnel could even potentially be used + between the end system and a PEP to protect traffic on this part of + the path. But, all of this adds complexity. And, it still does not + eliminate the risk of the traffic being exposed in the PEP itself as + the traffic is received from one IPsec tunnel, processed and then + forwarded (even if forwarded through another IPsec tunnel). + +4.1.1.3 Security Research Related to PEPs + + There is research underway investigating the possibility of changing + the implementation of IPsec to be more friendly to the use of PEPs. + One approach being actively looked at is the use of multi-layer IP + security. [Zhang00] describes a method which allows TCP headers to + be encrypted as one layer (with the PEPs in the path of the TCP + connections included in the security associations used to encrypt the + TCP headers) while the TCP payload is encrypted end-to-end as a + separate layer. This still involves trusting the PEP, but to a much + lesser extent. However, a drawback to this approach is that it adds + a significant amount of complexity to the IP security implementation. + Given the existing complexity of IPsec, this drawback is a serious + impediment to the standardization of the multi-layer IP security idea + and it is very unlikely that this approach will be adopted as a + standard any time soon. Therefore, relying on this type of approach + will likely involve the use of non-standard protocols (and the + associated risk of doing so). + +4.1.2 Fate Sharing + + Another important aspect of the end-to-end argument is fate sharing. 
+ If a failure occurs in the network, the ability of the connection to + survive the failure depends upon how much state is being maintained + + + +Border, et al. Informational [Page 16] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + + on behalf of the connection in the network and whether the state is + self-healing. If no connection specific state resides in the network + or such state is self-healing as in case of regular end-to-end + operation, then a failure in the network will break the connection + only if there is no alternate path through the network between the + end systems. And, if there is no path, both end systems can detect + this. However, if the connection depends upon some state being + stored in the network (e.g., in a PEP), then a failure in the network + (e.g., the node containing a PEP crashes) causes this state to be + lost, forcing the connection to terminate even if an alternate path + through the network exists. + + The importance of this aspect of the end-to-end argument with respect + to PEPs is dependent upon both the PEP implementation and upon the + types of applications being used. Sometimes coincidentally but more + often by design, PEPs are used in environments where there is no + alternate path between the end systems and, therefore, a failure of + the intermediate node containing a PEP would result in the + termination of the connection in any case. And, even when this is + not the case, the risk of losing the connection in the case of + regular end-to-end operation may exist as the connection could break + for some other reason, for example, a long enough link outage of a + last-hop wireless link to the end host. Therefore, users may choose + to accept the risk of a PEP crashing in order to take advantage of + the performance gains offered by the PEP implementation. 
The + important thing is that accepting the risk should be under the + control of the user (i.e., the user should always have the option to + choose end-to-end operation) and, if the user chooses to use the PEP, + the user should be aware of the implications that a PEP failure has + with respect to the applications being used. + +4.1.3 End-to-end Reliability + + Another aspect of the end-to-end argument is that of acknowledging + the receipt of data end-to-end in order to achieve reliable end-to- + end delivery of data. An application aiming at reliable end-to-end + delivery must implement an end-to-end check and recovery at the + application level. According to the end-to-end argument, this is the + only possibility to correctly implement reliable end-to-end + operation. Otherwise the application violates the end-to-end + argument. This also means that a correctly designed application can + never fully rely on the transport layer (e.g., TCP) or any other + communication subsystem to provide reliable end-to-end delivery. + + First, a TCP connection may break down for some reason and result in + lost data that must be recovered at the application level. Second, + the checksum provided by TCP may be considered inadequate, resulting + in undetected (by TCP) data corruption [Pax99] and requiring an + + + +Border, et al. Informational [Page 17] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + + application level check for data corruption. Third, a TCP + acknowledgement only indicates that data was delivered to the TCP + implementation on the other end system. It does not guarantee that + the data was delivered to the application layer on the other end + system. Therefore, a well designed application must use an + application layer acknowledgement to ensure end-to-end delivery of + application layer data. 
Note that this does not diminish the value + of a reliable transport protocol (i.e., TCP) as such a protocol + allows efficient implementation of several essential functions (e.g., + congestion control) for an application. + + If a PEP implementation acknowledges application data prematurely + (before the PEP receives an application ACK from the other endpoint), + end-to-end reliability cannot be guaranteed. Typically, application + layer PEPs do not acknowledge data prematurely, i.e., the PEP does + not send an application ACK to the sender until it receives an + application ACK from the receiver. And, transport layer PEP + implementations, including TCP PEPs, generally do not interfere with + end-to-end application layer acknowledgments as they let applications + operate end-to-end. However, the user and/or network administrator + employing the PEP must understand how it operates in order to + understand the risks related to end-to-end reliability. + + Some Internet applications do not necessarily operate end-to-end in + their regular operation, thus abandoning any end-to-end reliability + guarantee. For example, Internet email delivery often operates via + relay Mail Transfer Agents, that is, relay Simple Mail Transfer + Protocol (SMTP) servers. An originating MTA (SMTP server) sends the + mail message to a relay MTA that receives the mail message, stores it + in non-volatile storage (e.g., on disk) and then sends an application + level acknowledgement. The relay MTA then takes "full + responsibility" for delivering the mail message to the destination + SMTP server (maybe via another relay MTA); it tries to forward the + message for a relatively long time (typically around 5 days). This + scheme does not give a 100% guarantee of email delivery, but + reliability is considered "good enough". 
+ + An application layer PEP for this kind of an application may + acknowledge application data (e.g., mail message) without essentially + decreasing reliability, as long as the PEP operates according to the + same procedure as the regular proxy (e.g., relay MTA). Again, as + indicated above, the user and/or network administrator employing such + a PEP needs to understand how it operates in order to understand the + reliability risks associated with doing so. + + + + + + + +Border, et al. Informational [Page 18] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + +4.1.4 End-to-end Failure Diagnostics + + Another aspect of the end-to-end argument is the ability to support + end-to-end failure diagnostics when problems are encountered. If a + network problem occurs which breaks a connection, the end points of + the connection will detect the failure via timeouts. However, the + existence of a PEP in between the two end points could delay + (sometimes significantly) the detection of the failure by one or both + of the end points. (Of course, some PEPs are intentionally designed + to hide these types of failures as described in Section 3.4.) The + implications of delayed detection of a failed connection depend on + the applications being used. Possibilities range from no impact at + all (or just minor annoyance to the end user) all the way up to + impacting mission critical business functions by delaying switchovers + to alternate communications paths. + + In addition, tools used to debug connection failures may be affected + by the use of a PEP. For example, PING (described in [RFC792] and + [RFC2151]) is often used to test for connectivity. But, because PING + is based on ICMP instead of TCP (i.e., it is implemented using ICMP + Echo and Reply commands at the network layer), it is possible that + the configuration of the network might route PING traffic around the + PEP. 
Thus, PING could indicate that an end-to-end path exists + between two hosts when it does not actually exist for TCP traffic. + Even when the PING traffic does go through the PEP, the diagnostics + indications provided by the PING traffic are altered. For example, + if the PING traffic goes transparently through the PEP, PING does not + provide any indication that the PEP exists and since the PING traffic + is not being subjected to the same processing as TCP traffic, it may + not necessarily provide an accurate indication of the network delay + being experienced by TCP traffic. On the other hand, if the PEP + terminates the PING and responds to it on behalf of the end host, + then the PING provides information only on the connectivity to the + PEP. Traceroute (also described in [RFC2151]) is similarly affected + by the presence of the PEP. + +4.2 Asymmetric Routing + + Deploying a PEP implementation usually requires that traffic to and + from the end hosts is routed through the intermediate node(s) where + PEPs reside. With some networks, this cannot be accomplished, or it + might require that the intermediate node is located several hops away + from the target link edge which in turn is impractical in many cases + and may result in non-optimal routing. + + + + + + + +Border, et al. Informational [Page 19] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + + Note that this restriction does not apply to all PEP implementations. + For example, a PEP which is simply doing ACK spacing only needs to + see one direction of the traffic flow (the direction in which the + ACKs are flowing). ACK spacing can be done without seeing the actual + flow of data. + +4.3 Mobile Hosts + + In environments where a PEP implementation is used to serve mobile + hosts, additional problems may be encountered because PEP related + state information may need to be transferred to a new PEP node during + a handoff. + + When a mobile host moves, it is subject to handovers. 
If the + intermediate node and home for the serving PEP changes due to + handover, any state information that the PEP maintains and is + required for continuous operation must be transferred to the new + intermediate node to ensure continued operation of the connection. + This requires extra work and overhead and may not be possible to + perform fast enough, especially if the host moves frequently over + cell boundaries of a wireless network. If the mobile host moves to + another IP network, routing to and from the mobile host may need to + be changed to traverse a new PEP node. + + Today, mobility implications with respect to using PEPs are more + significant to W-LAN networks than to W-WAN networks. Currently, a + W-WAN base station typically does not provide the mobile host with + the connection point to the wireline Internet. (A W-WAN base station + may not even have an IP stack.) Instead, the W-WAN network takes + care of mobility with the connection point to the wireline Internet + remaining unchanged while the mobile host moves. Thus, PEP state + handover is not currently required in most W-WAN networks when the + host moves. However, this is generally not true in W-LAN networks + and, even in the case of W-WAN networks, the user and/or network + administrator using a PEP needs to be cognizant of how the W-WAN base + stations and the PEP work in case W-WAN PEP state handoff becomes + necessary in the future. + +4.4 Scalability + + Because a PEP typically processes packet information above the IP + layer, a PEP requires more processing power per packet than a router. + Therefore, PEPs will always be (at least) one step behind routers in + terms of the total throughput they can support. (Processing above + the IP layer is also more difficult to implement in hardware.) In + addition, since most PEP implementations require per connection + state, PEP memory requirements are generally significantly higher + + + + +Border, et al. 
Informational [Page 20] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + + than with a router. Therefore, a PEP implementation may have a limit + on the number of connections which it can support whereas a router + has no such limitation. + + Increased processing power and memory requirements introduce + scalability issues with respect to the use of PEPs. Placement of a + PEP on a high speed link or a link which supports a large number of + connections may require network topology changes beyond just + inserting the PEP into the path of the traffic. For example, if a + PEP can only handle half of the traffic on a link, multiple PEPs may + need to be used in parallel, adding complexity to the network + configuration to divide the traffic between the PEPs. + +4.5 Other Implications of Using PEPs + + This document describes some significant implications with respect to + using Performance Enhancing Proxies. However, the list of + implications provided in this document is not necessarily exhaustive. + Some examples of other potential implications related to using PEPs + include the use of PEPs in multi-homing environments and the use of + PEPs with respect to Quality of Service (QoS) transparency. For + example, there may be potential interaction with the priority-based + multiplexing mechanism described in Section 3.5 and the use of + differentiated services [RFC2475]. Therefore, users and network + administrators who wish to deploy a PEP should look not only at the + implications described in this document but also at the overall + impact (positive and negative) that the PEP will have on their + applications and network infrastructure, both initially and in the + future when new applications are added and/or changes in the network + infrastructure are required. + +5. PEP Environment Examples + + The following sections describe examples of environments where PEP is + currently used to improve performance. 
The examples are provided to + illustrate the use of the various PEP types and PEP mechanisms + described earlier in the document and to help illustrate the + motivation for their development and use. + +5.1 VSAT Environments + + Today, VSAT networks are implemented with geosynchronous satellites. + VSAT data networks are typically implemented using a star topology. + A large hub earth station is located at the center of the star with + VSATs used at the remote sites of the network. Data is sent from the + hub to the remote sites via an outroute. Data is sent from the + remote sites to the hub via one or more inroutes. VSATs represent an + environment with highly asymmetric links, with an outroute typically + + + +Border, et al. Informational [Page 21] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + + much larger than an inroute. (Multiple inroutes can be used with + each outroute but any particular VSAT only has access to a single + inroute at a time, making the link asymmetric.) + + VSAT networks are generally used to implement private networks (i.e., + intranets) for enterprises (e.g., corporations) with geographically + dispersed sites. VSAT networks are rarely, if ever, used to + implement Internet connectivity except at the edge of the Internet + (i.e., as the last hop). Connection to the Internet for the VSAT + network is usually implemented at the VSAT network hub site using + appropriate firewall and (when necessary) NAT [RFC2663] devices. + +5.1.1 VSAT Network Characteristics + + With respect to TCP performance, VSAT networks exhibit the following + subset of the satellite characteristics documented in [RFC2488]: + + Long feedback loops + + Propagation delay from a sender to a receiver in a geosynchronous + satellite network can range from 240 to 280 milliseconds, + depending on where the sending and receiving sites are in the + satellite footprint. This makes the round trip time just due to + propagation delay at least 480 milliseconds. 
Queueing delay and + delay due to shared channel access methods can sometimes increase + the total delay up to on the order of a few seconds. + + Large bandwidth*delay products + + VSAT networks can support capacity ranging from a few kilobits per + second up to multiple megabits per second. When combined with the + relatively long round trip time, TCP needs to keep a large number + of packets "in flight" in order to fully utilize the satellite + link. + + Asymmetric capacity + + As indicated above, the outroute of a VSAT network is usually + significantly larger than an inroute. Even though multiple + inroutes can be used within a network, a given VSAT can only + access one inroute at a time. Therefore, the incoming (outroute) + and outgoing (inroute) capacity for a VSAT is often very + asymmetric. As outroute capacity has increased in recent years, + ratios of 400 to 1 or greater are becoming more and more common. + With a TCP maximum segment size of 1460 bytes and delayed + acknowledgments [RFC1122] in use, the ratio of IP packet bytes for + data to IP packet bytes for ACKs is only (3000 to 40) 75 to 1. + + + + +Border, et al. Informational [Page 22] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + + Thus, inroute capacity for carrying ACKs can have a significant + impact on TCP performance. (The issue of asymmetric link impact + on TCP performance is described in more detail in [BPK97].) + + With respect to the other satellite characteristics listed in + [RFC2488], VSAT networks typically do not suffer from intermittent + connectivity or variable round trip times. Also, VSAT networks + generally include a significant amount of error correction coding. + This makes the bit error rate very low during clear sky conditions, + approaching the bit error rate of a typical terrestrial network. 
In + severe weather, the bit error rate may increase significantly but + such conditions are rare (when looked at from an overall network + availability point of view) and VSAT networks are generally + engineered to work during these conditions but not to optimize + performance during these conditions. + +5.1.2 VSAT Network PEP Implementations + + Performance Enhancing Proxies implemented for VSAT networks generally + focus on improving throughput (for applications such as FTP and HTTP + web page retrievals). To a lesser degree, PEP implementations also + work to improve interactive response time for small transactions. + + There is not a dominant PEP implementation used with VSAT networks. + Each VSAT network vendor tends to implement their own version of PEP + functionality, integrated with the other features of their VSAT + product. [HNS] and [SPACENET] describe VSAT products with integrated + PEP capabilities. There are also third party PEP implementations + designed to be used with VSAT networks. These products run on nodes + external to the VSAT network at the hub and remote sites. NettGain + [FLASH] and Venturi [FOURELLE] are examples of such products. VSAT + network PEP implementations generally share the following + characteristics: + + - They focus on improving TCP performance; + + - They use an asymmetric distributed implementation; + + - They use a split connection approach with local acknowledgments + and local retransmissions; + + - They support some form of compression to reduce the amount of + bandwidth required (with emphasis on saving inroute bandwidth). + + The key differentiators between VSAT network PEP implementations are: + + - The maximum throughput they attempt to support (mainly a + function of the amount of buffer space they use); + + + +Border, et al. Informational [Page 23] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + + - The protocol used over the satellite link. 
Some implementations + use a modified version of TCP while others use a proprietary + protocol running on top of UDP; + + - The type of compression used. Third party VSAT network PEP + implementations generally focus on application (e.g., HTTP) + specific compression algorithms while PEP implementations + integrated into the VSAT network generally focus on link + specific compression. + + PEP implementations integrated into a VSAT product are generally + transparent to the end systems. Third party PEP implementations used + with VSAT networks usually require configuration changes in the + remote site end systems to route TCP packets to the remote site + proxies but do not require changes to the hub site end systems. In + some cases, the PEP implementation is actually integrated + transparently into the end system node itself, using a "bump in the + stack" approach. In all cases, the use of a PEP is non-transparent + to the user, i.e., the user is aware when a PEP implementation is + being used to boost performance. + +5.1.3 VSAT Network PEP Motivation + + VSAT networks, since the early stages of their deployment, have + supported the use of local termination of a protocol (e.g., SDLC and + X.25) on each side of the satellite link to hide the satellite link + from the applications using the protocol. Therefore, when LAN + capabilities were added to VSAT networks, VSAT customers expected + and, in fact, demanded, the use of similar techniques for improving + the performance of IP based traffic, in particular TCP traffic. + + As indicated in Section 5.1, VSAT networks are primarily used to + implement intranets with Internet connectivity limited to and closely + controlled at the hub site of the VSAT network. Therefore, VSAT + customers are not as affected (or at least perceive that they are not + as affected) by the Internet related implications of using PEPs as + are other technologies. 
Instead, what is more important to VSAT + customers is the optimization of the network. And, VSAT customers, + in general, prefer that the optimization of the network be done by + the network itself rather than by implementing changes (such as + enabling the TCP scaled window option) to their own equipment. VSAT + customers prefer to optimize their end system configuration for local + communications related to their local mission critical functions and + let the VSAT network hide the presence of the satellite link as much + as possible. VSAT network vendors have also been able to use PEP + functionality to provide value added "services" to their customers + such as extending the useful life of older equipment which + includes older, "non-modern" TCP stacks. + + + +Border, et al. Informational [Page 24] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + + Of course, as the line between intranets and the Internet continues + to fade, the implications of using PEPs start to become more + significant for VSAT networks. For example, twelve years ago + security was not a major concern because the equipment cost related + to being able to intercept VSAT traffic was relatively high. Now, as + technology has advanced, the cost is much less prohibitive. + Therefore, because the use of PEP functionality in VSAT networks + prevents the use of IPsec, customers must rely on the use of higher + layer security mechanisms such as TLS or on proprietary security + mechanisms implemented in the VSAT networks themselves (since + currently many applications are incapable of making (or simply don't + make) use of the standardized higher layer security mechanisms). + This, in turn, affects the cost of the VSAT network as well as + affects the ability of the customers to make use of Internet based + capabilities. + +5.2 W-WAN Environments + + In mobile wireless WAN (W-WAN) environments the wireless link is + typically used as the last-hop link to the end user. 
W-WANs include + such networks as GSM [GSM], GPRS [GPRS],[BW97], CDPD [CDPD], IS-95 + [CDMA], RichoNet, and PHS. Many of these networks, but not all, have + been designed to provide mobile telephone voice service in the first + place but include data services as well or they evolve from a mobile + telephone network. + +5.2.1 W-WAN Network Characteristics + + W-WAN links typically exhibit some combination of the following link + characteristics: + + - low bandwidth (with some links the available bandwidth might be + as low as a few hundred bits/sec) + + - high latency (minimum round-trip delay close to one second is + not exceptional) + + - high BER resulting in frame or packet losses, or long variable + delays due to local link-layer error recovery + + - some W-WAN links have a lot of internal buffer space which tend + to accumulate data, thus resulting in increased round-trip + delay due to long (and variable) queuing delays + + - on some W-WAN links the users may share common channels for + their data packet delivery which, in turn, may cause unexpected + delays to the packet delivery of a user due to simultaneous use + of the same channel resources by the other users + + + +Border, et al. Informational [Page 25] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + + - unexpected link disconnections (or intermittent link outages) + may occur frequently and the period of disconnection may last a + very long time + + - (re)setting the link-connection up may take a long time + (several tens of seconds or even minutes) + + - the W-WAN network typically takes care of terminal mobility: + the connection point to the Internet is retained while the user + moves with the mobile host + + - the use of most W-WAN links is expensive. Many of the service + providers apply time-based charging. 
+ +5.2.2 W-WAN PEP Implementations + + Performance Enhancing Proxies implemented for W-WAN environments + generally focus on improving the interactive response time but at the + same time aim at improving throughput, mainly by reducing the + transfer volume over the inherently slow link in various ways. To + achieve this, typically enhancements are applied at almost all + protocol layers. + +5.2.2.1 Mowgli System + + The Mowgli system [KRA94] is one of the early approaches to address + the challenges induced by the problematic characteristics of low + bandwidth W-WAN links. + + The indirect approach used in Mowgli is not limited to a single layer + as in many other split connection approaches, but it involves all + protocol layers. The basic architecture is based on split TCP (UDP + is also supported) together with full support for application layer + proxies with a distributed PEP approach. An application layer proxy + pair may be added between a client and server, the agent (local + proxy) on a mobile host and the proxy on an intermediate node that + provides the mobile host with the connection to the wireline + Internet. Such a pair may be either explicit or fully transparent to + the applications, but it is, at all times, under end-user control + thus allowing the user to select the traffic that traverses through + the PEP implementation and choose end-to-end IP for other traffic. + + In order to allow running legacy applications unmodified and without + recompilation, the socket layer implementation on the mobile host is + slightly modified to connect the applications, which are configured + to traverse through the PEP, to a local agent while retaining the + original TCP/IP socket semantics. Two types of application layer + agent-proxy pairs can be configured for mobile host application use. + + + +Border, et al. 
Informational [Page 26] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + + A generic pair can be used with any application and it simply + provides split transport service with some optional generic + enhancements like compression. An application-specific pair can be + tailored for any application or a group of applications that are able + to take leverage on the same kind of enhancements. A good example of + enhancements achieved with an application-specific proxy pair is the + Mowgli WWW system that improves significantly the user perceived + response time of Web browsing mainly by reducing the transfer volume + and the number of round trips over the wireless link [LAKLR95], + [LHKR96]. + + Mowgli provides also an option to replace the TCP/IP core protocols + on the last-hop link with a custom protocol that is tuned for low- + bandwidth W-WAN links [KRLKA97]. This protocol was designed to + provide the same transport service with similar semantics as regular + TCP and UDP provide, but use a different protocol implementation that + can freely apply any appropriate protocol mechanisms without being + constrained by the current TCP/IP packet format or protocol + operation. As this protocol is required to operate over a single + logical link only, it could partially combine the protocol control + information and protocol operation of the link, network, and + transport layers. In addition, the protocol can operate on top of + various link services, for example on top of different raw link + services, on top of PPP, on top of IP, or even on top of a single TCP + connection using it as a link service and implementing "TCP + multiplexing" over it. In all other cases, except when the protocol + is configured to operate on top of raw (wireless) link service, IP + may co-exist with the custom protocol allowing simultaneous end-to- + end IP delivery for the traffic not traversing through the PEP + implementation. 
+ + Furthermore, the custom protocol can be run in different operation + modes which turn on or off certain protocol functions depending on + the underlying link service. For example, if the underlying link + service provides reliable data delivery, the checksum and the + window-based error recovery can be turned off, thus reducing the + protocol overhead; only a very simple recovery mechanism is needed to + allow recovery from an unexpected link disconnection. Therefore, the + protocol design was able to use extremely efficient header encoding + (only 1-3 bytes per packet in a typical case), reduce the number of + round trips significantly, and various features that are useful with + low-bandwidth W-WAN links were easy to add. Such features include + suspending the protocol operation over the periods of link + disconnection or link outage together with fast start once the link + becomes operational again, priority-based multiplexing of user data + over the W-WAN link thus offering link capacity to interactive + + + + + +Border, et al. Informational [Page 27] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + + applications in a timely manner even in presence of bandwidth- + intensive background transfers, and link-level flow control to + prevent data from accumulating into the W-WAN link internal buffers. + + If desired, regular TCP/IP transport, possibly with corresponding + protocol modifications in TCP (and UDP) that would tune it more + suitable for W-WAN links, can be employed on the last-hop link. + +5.2.2.2 Wireless Application Protocol (WAP) + + The Mowgli system was designed to support mobile hosts that are + attached to the Internet over constrained links, but did not address + the specific challenges with low-end mobile devices. Many mobile + wireless devices are power, memory, and processing constrained, and + the communication links to these devices have lower bandwidth and + less stable connections. 
These limitations led designers to develop + the Wireless Application Protocol (WAP) that specifies an application + framework and network protocols intended to work across differing + narrowband wireless network technologies bringing Internet content + and advanced data services to low-end digital cellular phones and + other mobile wireless terminals, such as pagers and PDAs. + + The WAP model consists of a WAP client (mobile terminal), a WAP + proxy, and an origin server. It requires a WAP proxy between the WAP + client and the server on the Internet. WAP uses a layered, scalable + architecture [WAPARCH], specifying the following five protocol layers + to be used between the terminal and the proxy: Application Layer + (WAE) [WAPWAE], Session Layer (WSP) [WAPWSP], Transaction Layer (WTP) + [WAPWTP], Security Layer (WTLS) [WAPWTLS], and Transport Layer (WDP) + [WAPWDP]. Standard Internet protocols are used between the proxy and + the origin server. If the origin server includes WAP proxy + functionality, it is called a WAP Server. + + In a typical scenario, a WAP client sends an encoded WAP request to a + WAP proxy. The WAP proxy translates the WAP request into a WWW + (HTTP) request, performing the required protocol conversions, and + submits this request to a standard web server on the Internet. After + the web server responds to the WAP proxy, the response is encoded + into a more compact binary format to decrease the size of the data + over the air. This encoded response is forwarded to the WAP client + [WAPPROXY]. + + WAP operates over a variety of bearer datagram services. When + communicating over these bearer services, the WAP transport layer + (WDP) is always used between the WAP client and WAP proxy and it + provides port addressed datagram service to the higher WAP layers. + If the bearer service supports IP (e.g., GSM-CSD, GSM-GPRS, IS-136, + CDPD), UDP is used as the datagram protocol. However, if the bearer + + + +Border, et al. 
Informational [Page 28] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + + service does not support IP (e.g., GSM-SMS, GSM-USSD, GSM Cell + Broadcast, CDMS-SMS, TETRA-SDS), WDP implements the required datagram + protocol as an adaptation layer between the bearer network and the + protocol stack. + + The use of the other layers depends on the port number. WAP has + registered a set of well-known ports with IANA. The port number + selected by the application for communication between a WAP client + and proxy defines the other layers to be used at each end. The + security layer, WTLS, provides privacy, data integrity and + authentication. Its functionality is similar to TLS 1.0 [RFC2246] + extended with datagram support, optimized handshake and dynamic key + refreshing. If the origin server includes WAP proxy functionality, + it might be used to facilitate the end-to-end security solutions, + otherwise it provides security between the mobile terminal and the + proxy. + + The transaction layer, WTP, is message based without connection + establishment and tear down. It supports three types of transaction + classes: an unconfirmed request (unidirectional), a reliable + (confirmed) request (unidirectional), and a reliable (confirmed) + request-reply transaction. Data is carried in the first packet and + 3-way handshake is eliminated to reduce latencies. In addition + acknowledgments, retransmission, and flow control are provided. It + allows more than one outstanding transaction at a time. It handles + the bearer dependence of a transfer, e.g., selects timeout values and + packet sizes according to the bearer. Unfortunately, WTP uses fixed + retransmission timers and does not include congestion control, which + is a potential problem area as the use of WAP increases [RFC3002]. 
+ + The session layer, WSP, supports binary encoded HTTP 1.1 with some + extensions such as long living session with suspend/resume facility + and state handling, header caching, and push facility. On top of the + architecture is the application environment (WAE). + +5.2.3 W-WAN PEP Motivation + + As indicated in Section 5.2.1, W-WAN networks typically offer very + low bandwidth connections with high latency and relatively frequent + periods of link disconnection and they usually are expensive to use. + Therefore, the transfer volume and extra round-trips, such as those + associated with TCP connection setup and teardown, must be reduced + and the slow W-WAN link should be efficiently shielded from excess + traffic and global (wired) Internet congestion to make Internet + access usable and economical. Furthermore, interactive traffic must + be transmitted in a timely manner even if there are other + simultaneous bandwidth intensive (background) transfers and during + the periods with connectivity the link must be kept fully utilized + + + +Border, et al. Informational [Page 29] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + + due to expensive use. In addition, the (long) periods of link + disconnection must not abort active (bulk data) transfers, if an + end-user so desires. + + As (all) applications cannot be made mobility/W-WAN aware in short + time frame or maybe ever, support for mobile W-WAN use should be + implemented in a way which allows most applications, at least those + running on fixed Internet hosts, to continue their operation + unmodified. + +5.3 W-LAN Environments + + Wireless LANs (W-LAN) are typically organized in a cellular topology + where an access point with a W-LAN transceiver controls a single + cell. A cell is defined in terms of the coverage area of the base + station. The access points are directly connected to the wired + network. 
The access point in each of the cells is responsible for + forwarding packets to and from the hosts located in the cell. Often + the hosts with W-LAN transceivers are mobile. When such a mobile + host moves from one cell to another cell, the responsibility for + forwarding packets between the wired network and the mobile host must + be transferred to the access point of the new cell. This is known as + a handoff. Many W-LAN systems also support an operation mode + enabling ad-hoc networking. In this mode access points are not + necessarily needed, but hosts with W-LAN transceiver can communicate + directly with the other hosts within the transceiver's transmission + range. + +5.3.1 W-LAN Network Characteristics + + Current wireless LANs typically provide link bandwidth from 1 Mbps to + 11 Mbps. In the future, wide deployment of higher bandwidths up to + 54 Mbps or even higher can be expected. The round-trip delay with + wireless LANs is on the order of a few milliseconds or tens of + milliseconds. Examples of W-LANs include IEEE 802.11, HomeRF, and + Hiperlan. Wireless personal area networks (WPAN) such as Bluetooth + can use the same PEP techniques. + + Wireless LANs are error-prone due to bit errors, collisions and link + outages. In addition, consecutive packet losses may also occur + during handoffs. Most W-LAN MAC protocols perform low level + retransmissions. This feature shields upper layers from most losses. + However, unavoidable losses, retransmission latency and link outages + still affect upper layers. TCP performance over W-LANs or a network + path involving a W-LAN link is likely to suffer from these effects. + + + + + + +Border, et al. Informational [Page 30] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + + As TCP wrongly interprets these packet losses to be network + congestion, the TCP sender reduces its congestion window and is often + forced to timeout in order to recover from the consecutive losses. 
+ The result is often unacceptably poor end-to-end performance. + +5.3.2 W-LAN PEP Implementations: Snoop + + Berkeley's Snoop protocol [SNOOP] is a TCP-specific approach in which + a TCP-aware module, a Snoop agent, is deployed at the W-LAN base + station that acts as the last-hop router to the mobile host. Snoop + aims at retaining the TCP end-to-end semantics. The Snoop agent + monitors every packet that passes through the base station in either + direction and maintains soft state for each TCP connection. The + Snoop agent is an asymmetric PEP implementation as it operates + differently on TCP data and ACK channels as well as on the uplink + (from the mobile host) and downlink (to the mobile host) TCP + segments. + + For a data transfer to a mobile host, the Snoop agent caches + unacknowledged TCP data segments which it forwards to the TCP + receiver and monitors the corresponding ACKs. It does two things: + + 1. Retransmits any lost data segments locally by using local timers + and TCP duplicate ACKs to identify packet loss, instead of waiting + for the TCP sender to do so end-to-end. + + 2. Suppresses the duplicate ACKs on their way from the mobile host + back to the sender, thus avoiding fast retransmit and congestion + avoidance at the latter. + + Suppressing the duplicate ACKs is required to avoid unnecessary fast + retransmits by the TCP sender as the Snoop agent retransmits a packet + locally. Consider a system that employs the Snoop agent and a TCP + sender S that sends packets to receiver R via a base station BS. + Assume that S sends packets A, B, C, D, E (in that order) which are + forwarded by BS to the wireless receiver R. Assume the first + transmission of packet B is lost due to errors on the wireless link. + In this case, R receives packets A, C, D, E and B (in that order). + Receipt of packets C, D and E trigger duplicate ACKs. 
When S + receives three duplicate ACKs, it triggers fast retransmit (which + results in a retransmission, as well as reduction of the congestion + window). The Snoop agent also retransmits B locally, when it + receives three duplicate ACKs. The fast retransmit at S occurs + despite the local retransmit on the wireless link, degrading + throughput. Snoop deals with this problem by dropping TCP duplicate + ACKs appropriately at BS. + + + + + +Border, et al. Informational [Page 31] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + + For a data transfer from a mobile host, the Snoop agent detects the + packet losses on the wireless link by monitoring the data segments it + forwards. It then employs either Negative Acknowledgements (NAK) + locally or Explicit Loss Notifications (ELN) to inform the mobile + sender that the packet loss was not related to congestion, thus + allowing the sender to retransmit without triggering normal + congestion control procedures. To implement this, changes at the + mobile host are required. + + When a Snoop agent uses NAKs to inform the TCP sender of the packet + losses on the wireless link, one possibility to implement them is + using the Selective Acknowledgment (SACK) option of TCP [RFC2018]. + This requires enabling SACK processing at the mobile host. The Snoop + agent sends a TCP SACK, when it detects a hole in the transmission + sequence from the mobile host or when it has not received any new + packets from the mobile host for a certain time period. This + approach relies on the advisory nature of the SACKs: the mobile + sender is advised to retransmit the missing segments indicated by + SACK, but it must not assume successful end-to-end delivery of the + segments acknowledged with SACK as these segments might get lost + later in the path to the receiver. Instead, the sender must wait for + a cumulative ACK to arrive. 
+ + When the ELN mechanism is used to inform the mobile sender of the + packet losses, Snoop uses one of the 'unreserved' bits in the TCP + header for ELN [SNOOPELN]. The Snoop agent keeps track of the holes + that correspond to segments lost over the wireless link. When a + (duplicate) ACK corresponding to a hole in the sequence space arrives + from the TCP receiver, the Snoop agent sets the ELN bit on the ACK to + indicate that the loss is unrelated to congestion and then forwards + the ACK to the TCP sender. When the sender receives a certain number + of (duplicate) ACKs with ELN (a configurable variable at the mobile + host, e.g., two), it retransmits the missing segment without + performing any congestion control measures. + + The ELN mechanism using one of the six bits reserved for future use + in the TCP header is dangerous as it exercises checks that might not + be correctly implemented in TCP stacks, and may expose bugs. + + A scheme such as Snoop is needed only if the possibility of a fast + retransmit due to wireless errors is non-negligible. In particular, + if the wireless link uses link-layer recovery for lost data, then + this scheme is not beneficial. Also, if the TCP window tends to stay + smaller than four segments, for example, due to congestion related + losses on the wired network, the probability that the Snoop agent + will have an opportunity to locally retransmit a lost packet is + small. This is because at least three duplicate ACKs are needed to + trigger the local retransmission, but due to small window the Snoop + + + +Border, et al. Informational [Page 32] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + + agent may not be able to forward three new packets after the lost + packet and thus induce the required three duplicate ACKs. + Conversely, when the TCP window is large enough, Snoop can provide + significant performance improvement (compared with standard TCP).
+ + In order to alleviate the problem with small TCP windows, Snoop + proposes a solution in which a TCP sender is allowed to transmit a + new data segment for each duplicate ACK it receives as long as the + number of duplicate ACKs is less than the threshold for TCP fast + retransmission (three duplicate ACKs). If the new segment reaches + the receiver, it will generate another duplicate ACK which, in turn, + allows the sender to transmit yet another data segment. This + continues until enough duplicate ACKs have accumulated to trigger TCP + fast retransmission. This proposal is the same as the "Limited + Transmit" proposal [RFC3042] that has recently been forwarded to the + standards track. However, to be able to benefit from this solution, + it needs to be deployed on TCP senders and therefore it is not ready + for use in a short time frame. + + Snoop requires the intermediate node (base station) to examine and + operate on the traffic between the mobile host and the other end host + on the wired Internet. Hence, Snoop does not work if the IP traffic + is encrypted. Possible solutions involve: + + - making the Snoop agent a party to the security association + between the client and the server; + + - IPsec tunneling mode, terminated at the Snooping base station. + + However, these techniques require that users trust base stations. + + Snoop also requires that both the data and the corresponding ACKs + traverse the same base station. Furthermore, the Snoop agent may + duplicate efforts by the link layer as it retransmits the TCP data + segments "at the transport layer" across the wireless link. (Snoop + has been described by its designers as a TCP-aware link layer. This + is the right approach: the link and network layers can be much more + aware of each other than strict layering suggests.) + +5.3.3 W-LAN PEP Motivation + + Wireless LANs suffer from an error prone wireless channel.
Errors + can typically be considered bursty and channel conditions may change + rapidly from mobility and environmental changes. Packets are dropped + from bit errors or during handovers. Periods of link outage can also + be experienced. Although the typical MAC performs retransmissions, + dropped packets, outages and retransmission latency still can have + serious performance implications for IP performance, especially TCP. + + + +Border, et al. Informational [Page 33] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + + PEPs can be used to alleviate problems caused by packet losses, + protect TCP from link outages, and to add priority multiplexing. + Techniques such as Snoop are integrally implemented in access points, + while priority and compression schemes are distributed across the W- + LAN. + +6. Security Considerations + + The use of Performance Enhancing Proxies introduces several issues + which impact security. First, (as described in detail in Section + 4.1.1,) using PEPs and using IPsec is generally mutually exclusive. + Unless the PEP is also both capable and trusted to be the endpoint of + an IPsec tunnel (and the use of an IPsec tunnel is deemed good enough + security for the applicable threat model), a user or network + administrator must choose between improved performance and network + layer security. In some cases, transport (or higher) layer security + can be used in conjunction with a PEP to mitigate the impact of not + having network layer security. But, support by applications for the + use of transport (or higher) layer security is far from ubiquitous. + + Additionally, the PEP itself needs to be protected from attack. + First, even when IPsec tunnels are used with the PEP, the PEP + represents a point in the network where traffic is exposed. And, the + placement of a PEP in the network makes it an ideal platform from + which to launch a denial of service or man in the middle attack. 
+ (Also, taking the PEP out of action is a potential denial of service + attack itself.) Therefore, the PEP must be protected (e.g., by a + firewall) or must protect itself from improper access by an attacker + just like any other device which resides in a network. + +7. IANA Considerations + + This document is an informational overview document and, as such, + does not introduce new nor modify existing name or number spaces + managed by IANA. + +8. Acknowledgements + + This document grew out of the Internet-Draft "TCP Performance + Enhancing Proxy Terminology", RFC 2757 "Long Thin Networks", and work + done in the IETF TCPSAT working group. The authors are indebted to + the active members of the PILC working group. In particular, Joe + Touch and Mark Allman gave us invaluable feedback on various aspects + of the document and Magdolna Gerendai provided us with essential help + on the WAP example. + + + + + + +Border, et al. Informational [Page 34] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + +9. References + + [BBKT97] P. Bhagwat, P. Bhattacharya, A. Krishma, S.K. Tripathi, + "Using channel state dependent packet scheduling to + improve TCP throughput over wireless LANs," ACM Wireless + Networks, March 1997, pp. 91 - 102. Available at: + http://www.acm.org/pubs + /articles/journals/wireless/1997-3-1/p91-bhagwat/p91- + bhagwat.pdf + + [BPK97] H. Balakrishnan, V.N. Padmanabhan, R.H. Katz, "The + Effects of Asymmetry on TCP Performance," Proc. ACM/IEEE + Mobicom, Budapest, Hungary, September 1997. + + [BW97] G. Brasche, B. Walke, "Concepts, Services, and Protocols + of the New GSM Phase 2+ general Packet Radio Service," + IEEE Communications Magazine, Vol. 35, No. 8, August + 1997. + + [CDMA] Electronic Industry Alliance (EIA)/Telecommunications + Industry Association (TIA), IS-95: Mobile Station-Base + Station Compatibility Standard for Dual-Mode Wideband + Spread Spectrum Cellular System, 1993. 
+ + [CDPD] Wireless Data Forum, CDPD System Specification, Release + 1.1, 1995. + + [CTC+97] H. Chang, C. Tait, N. Cohen, M. Shapiro, S. Mastrianni, + R. Floyd, B. Housel, D. Lindquist, "Web Browsing in a + Wireless Environment: Disconnected and Asynchronous + Operation in ARTour Web Express," Proc. MobiCom'97, + Budapest, Hungary, September 1997. + + [FMSBMR98] D.C. Feldmeier, A.J. McAuley, J.M. Smith, D.S. Bakin, + W.S. Marcus, T.M. Raleigh, "Protocol Boosters," IEEE + Journal on Selected Areas of Communication, Vol. 16, No. + 3, April 1998. + + [FLASH] Flash Networks Ltd., performance boosting products + technology vendor based in Holmdel, New Jersey. Website + at http://www.flashnetworks.com. + + [FOURELLE] Fourelle Systems, performance boosting products + technology vendor based in Santa Clara, California. + Website at http://www.fourelle.com. + + [GPRS] ETSI, "General Packet Radio Service (GPRS): Service + Description, Stage 2," GSM03.60, v.6.1.1, August 1998. + + + +Border, et al. Informational [Page 35] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + + [GSM] M. Rahnema, "Overview of the GSM system and protocol + architecture," IEEE Communications Magazine, Vol. 31, No. + 4, pp. 92-100, April 1993. + + [HNS] Hughes Network Systems, Inc., VSAT technology vendor + based in Germantown, Maryland. Website at + http://www.hns.com. + + [I-TCP] A. Bakre, B.R. Badrinath, "I-TCP: Indirect TCP for Mobile + Hosts," Proc. 15th International Conference on + Distributed Computing Systems (ICDCS), May 1995. + + [KRA94] M. Kojo, K. Raatikainen, T. Alanko, "Connecting Mobile + Workstations to the Internet over a Digital Cellular + Telephone Network," Proc. Workshop on Mobile and Wireless + Information Systems (MOBIDATA), Rutgers University, NJ, + November 1994. Revised version published in Mobile + Computing, pp. 253-270, Kluwer, 1996. + + [KRLKA97] M. Kojo, K. Raatikainen, M. Liljeberg, J. Kiiskinen, T. 
+ Alanko, "An Efficient Transport Service for Slow Wireless + Telephone Links," IEEE Journal on Selected Areas of + Communication, Vol. 15, No. 7, September 1997. + + [LAKLR95] M. Liljeberg, T. Alanko, M. Kojo, H. Laamanen, K. + Raatikainen, "Optimizing World-Wide Web for Weakly- + Connected Mobile Workstations: An Indirect Approach," + Proc. of the 2nd Int. Workshop on Services in Distributed + and Networked Environments, Whistler, Canada, pp. 132- + 139, June 1995. + + [LHKR96] M. Liljeberg, H. Helin, M. Kojo, K. Raatikainen, "Mowgli + WWW Software: Improved Usability of WWW in Mobile WAN + Environments," Proc. IEEE Global Internet 1996 + Conference, London, UK, November 1996. + + [M-TCP] K. Brown, S. Singh, "M-TCP: TCP for Mobile Cellular + Networks," ACM Computer Communications Review Volume + 27(5), 1997. Available at + ftp://ftp.ece.orst.edu/pub/singh/papers/mtcp.ps.gz. + + [Pax99] V. Paxson, "End-to-End Internet Packet Dynamics," + IEEE/ACM Transactions on Networking, Vol. 7, No. 3, 1999, + pp. 277-292. + + [PILCWEB] http://pilc.grc.nasa.gov. + + + + + +Border, et al. Informational [Page 36] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + + [RFC0792] Postel, J., "Internet Control Message Protocol", STD 5, + RFC 792, September 1981. + + [RFC0793] Postel, J., "Transmission Control Protocol", STD 7, RFC + 793, September 1981. + + [RFC1122] Braden, R., "Requirements for Internet Hosts -- + Communications Layers", STD 3, RFC 1122, October 1989. + + [RFC1144] Jacobson, V., "Compressing TCP/IP Headers for Low-Speed + Serial Links", RFC 1144, February 1990. + + [RFC1323] Jacobson, V., Braden, R. and D. Borman, "TCP Extensions + for High Performance", RFC 1323, May 1992. + + [RFC1958] Carpenter, B., "Architectural Principles of the + Internet", RFC 1958, June 1996. + + [RFC2018] Mathis, M., Mahdavi, J., Floyd, S. and A. Romanow, "TCP + Selective Acknowledgment Options", RFC 2018, October + 1996. + + [RFC2151] Kessler, G. and S. 
Shepard, "A Primer On Internet and + TCP/IP Tools and Utilities", FYI 30, RFC 2151, June 1997. + + [RFC2246] Dierk, T. and E. Allen, "TLS Protocol Version 1," RFC + 2246, January 1999. + + [RFC2393] Shacham, A., Monsour, R., Pereira, R. and M. Thomas, "IP + Payload Compression Protocol (IPcomp)", RFC 2393, + December 1998. + + [RFC2401] Kent, S., and R. Atkinson, "Security Architecture for the + Internet Protocol", RFC 2401, November 1998. + + [RFC2475] Blake, S., Black, D., Carlson, M., Davies, E., Wang, Z. + and W. Weiss, "An Architecture for Differentiated + Services", RFC 2475, December 1998. + + [RFC2488] Allman, M., Glover, D. and L. Sanchez, "Enhancing TCP + Over Satellite Channels using Standard Mechanisms", BCP + 28, RFC 2488, January 1999. + + [RFC2507] Degermark, M., Nordgren, B. and S. Pink, "IP Header + Compression", RFC 2507, February 1999. + + + + + + +Border, et al. Informational [Page 37] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + + [RFC2508] Casner, S. and V. Jacobson, "Compressing IP/UDP/RTP + Headers for Low-Speed Serial Links", RFC 2508, February + 1999. + + [RFC2509] Engan, M., Casner, S. and C. Bormann, "IP Header + Compression over PPP", RFC 2509, February 1999. + + [RFC2663] Srisuresh, P. and Y. Holdrege, "IP Network Address + Translator (NAT) Terminology and Considerations", RFC + 2663, August 1999. + + [RFC2760] Allman, M., Dawkins, S., Glover, D., Griner, J., + Henderson, T., Heidemann, J., Kruse, H., Ostermann, S., + Scott, K., Semke, J., Touch, J. and D. Tran, "Ongoing TCP + Research Related to Satellites", RFC 2760, February 2000. + + [RFC3002] Mitzel, D., "Overview of 2000 IAB Wireless + Internetworking Workshop", RFC 3002, December 2000. + + [RFC3042] Allman, M., Balakrishnan, H. and S. Floyd, "Enhancing + TCP's Loss Recovery Using Limited Transmit", RFC 3042, + January 2001. + + [SHEL00] Z. Shelby, T. Saarinen, P. Mahonen, D. Melpignano, A. + Marshall, L. 
Munoz, "Wireless IPv6 Networks - WINE," IST + Mobile Summit, Ireland, October 2000. + + [SNOOP] H. Balakrishnan, S. Seshan, E. Amir, R. Katz, "Improving + TCP/IP Performance over Wireless Networks," Proc. 1st ACM + Conference on Mobile Communications and Networking + (Mobicom), Berkeley, California, November 1995. + + [SNOOPELN] H. Balakrishnan, R. Katz, "Explicit Loss Notification and + Wireless Web Performance," Proc. IEEE Globecom 1998, + Internet Mini-Conference, Sydney, Australia, November + 1998. + + [SPACENET] Spacenet, VSAT technology vendor based in Mclean, + Virginia. Website at http://www.spacenet.com. + + [SRC84] J.H. Saltzer, D.P. Reed, D.D. Clark, "End-To-End + Arguments in System Design," ACM TOCS, Vol. 2, No. 4, pp. + 277-288, November 1984. + + [WAPARCH] Wireless Application Protocol Architecture Specification, + April 1998, http://www.wapforum.org. + + + + + +Border, et al. Informational [Page 38] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + + [WAPPROXY] Wireless Application Protocol Push Proxy Gateway Service + Specification, August 1999, http://www.wapforum.org. + + [WAPWAE] Wireless Application Protocol Wireless Application + Environment Overview, March 2000, + http://www.wapforum.org. + + [WAPWDP] Wireless Application Protocol Wireless Datagram Protocol + Specification, February 2000, http://www.wapforum.org. + + [WAPWSP] Wireless Application Protocol Wireless Session Protocol + Specification, May 2000, http://www.wapforum.org. + + [WAPWTLS] Wireless Application Protocol Wireless Transport Layer + Security Specification, February 2000, + http://www.wapforum.org. + + [WAPWTP] Wireless Application Protocol Wireless Transaction + Protocol Specification, February 2000, + http://www.wapforum.org. + + [Zhang00] Y. Zhang, B. Singh, "A Multi-Layer IPsec Protocol," Proc. + proceedings of 9th USENIX Security Symposium, Denver, + Colorado, August 2000. Available at + http://www.wins.hrl.com/people/ygz/papers/usenix00.html. + +10. 
Authors' Addresses + + Questions about this document may be directed to: + + John Border + Hughes Network Systems + 11717 Exploration Lane + Germantown, Maryland 20876 + + Phone: +1-301-548-6819 + Fax: +1-301-548-1196 + EMail: border@hns.com + + + + + + + + + + + + + +Border, et al. Informational [Page 39] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + + Markku Kojo + Department of Computer Science + University of Helsinki + P.O. Box 26 (Teollisuuskatu 23) + FIN-00014 HELSINKI + Finland + + Phone: +358-9-1914-4179 + Fax: +358-9-1914-4441 + EMail: kojo@cs.helsinki.fi + + + Jim Griner + NASA Glenn Research Center + MS: 54-5 + 21000 Brookpark Road + Cleveland, Ohio 44135-3191 + + Phone: +1-216-433-5787 + Fax: +1-216-433-8705 + EMail: jgriner@grc.nasa.gov + + + Gabriel Montenegro + Sun Microsystems Laboratories, Europe + 29, chemin du Vieux Chene + 38240 Meylan, FRANCE + + Phone: +33 476 18 80 45 + EMail: gab@sun.com + + + Zach Shelby + University of Oulu + Center for Wireless Communications + PO Box 4500 + FIN-90014 + Finland + + Phone: +358-40-779-6297 + EMail: zach.shelby@ee.oulu.fi + + + + + + + + + + +Border, et al. Informational [Page 40] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + +Appendix A - PEP Terminology Summary + + This appendix provides a summary of terminology frequently used + during discussion of Performance Enhancing Proxies. (In some cases, + these terms have different meanings from their non-PEP related + usage.) + + ACK filtering + + Removing acknowledgments to prevent congestion of a low speed + link, usually used with paths which include a highly asymmetric + link. Sometimes also called ACK reduction. See Section 3.1.4. + + ACK spacing + + Delayed forwarding of acknowledgments in order to space them + appropriately, for example, to help minimize the burstiness of + TCP data. See Section 3.1.1. + + application layer PEP + + A Performance Enhancing Proxy operating above the transport + layer.
May be aimed at improving application or transport + protocol performance (or both). Described in detail in Section + 2.1.2. + + asymmetric link + + A link which has different rates for the forward channel (used for + data segments) and the back (or return) channel (used for ACKs). + + available bandwidth + + The total capacity of a link available to carry information at any + given time. May be lower than the raw bandwidth due to competing + traffic. + + bandwidth utilization + + The actual amount of information delivered over a link in a given + period, usually expressed as a percent of the raw bandwidth of + the link. + + gateway + + Has several meanings with respect to PEPs, depending on context: + + - An access point to a particular link; + + + +Border, et al. Informational [Page 41] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + + - A device capable of initiating and terminating connections + on + + behalf of a user or end system (e.g., a firewall or proxy). + + Not necessarily, but could be, a router. + + in flight (data) + + Data sent but not yet acknowledged. More precisely, data sent for + which the sender has not yet received the acknowledgement. + + link layer PEP + + A Performance Enhancing Proxy operating below the network layer. + + local acknowledgement + + The generation of acknowledgments by an entity in the path + between two end systems in order to allow the sending system to + transmit more data without waiting for end-to-end + acknowledgments. Described (in the context of TCP) in Section + 3.1.2. + + performance enhancing proxy + + An entity in the network acting on behalf of an end system or user + (with or without the knowledge of the end system or user) in order + to enhance protocol performance. Section 2 describes various + types of performance enhancing proxies. Section 3 describes the + mechanisms performance enhancing proxies use to improve + performance. 
+ + raw bandwidth + + The total capacity of an unloaded link available to carry + information. + + Snoop + + A TCP-aware link layer developed for wireless packet radio and + cellular networks. It works by caching segments at a wireless + base station. If the base station sees duplicate acknowledgments + for a segment that it has cached, it retransmits the missing + segment while suppressing the duplicate acknowledgement stream + being forwarded back to the sender until the wireless receiver + starts to acknowledge new data. Described in detail in Section + 5.3.2 and [SNOOP]. + + + +Border, et al. Informational [Page 42] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + + split connection + + A connection that has been terminated before reaching the intended + destination end system in order to initiate another connection + towards the end system. This allows the use of different + connection characteristics for different parts of the path of + the originally intended connection. See Section 2.4. + + TCP PEP + + A Performance Enhancing Proxy operating at the transport layer + with TCP. Aimed at improving TCP performance. + + TCP splitting + + Using one or more split TCP connections to improve TCP + performance. + + TCP spoofing + + Sometimes used as a synonym for TCP PEP. More accurately, TCP + spoofing refers to using transparent (to the TCP stacks in the + end systems) mechanisms to improve TCP performance. See Section + 2.1.1. + + transparent + + In the context of a PEP, transparent refers to not requiring + changes to be made to the end systems, transport endpoints + and/or applications involved in a connection. See Section 2.5 + for a more detailed explanation. + + transport layer PEP + + A Performance Enhancing Proxy operating at the transport layer. + Described in detail in Section 2.1.1. + + tunneling + + In the context of PEPs, tunneling refers to the process of + wrapping a packet for transmission over a particular link + between two PEPs. 
See Section 3.2. + + + + + + + + + +Border, et al. Informational [Page 43] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + + WAP + + The Wireless Application Protocol specifies an application + framework and network protocols intended to work across + differing narrow-band wireless network technologies. See + Section 5.2.2.2. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Border, et al. Informational [Page 44] + +RFC 3135 PILC - Performance Enhancing Proxies June 2001 + + +Full Copyright Statement + + Copyright (C) The Internet Society (2001). All Rights Reserved. + + This document and translations of it may be copied and furnished to + others, and derivative works that comment on or otherwise explain it + or assist in its implementation may be prepared, copied, published + and distributed, in whole or in part, without restriction of any + kind, provided that the above copyright notice and this paragraph are + included on all such copies and derivative works. However, this + document itself may not be modified in any way, such as by removing + the copyright notice or references to the Internet Society or other + Internet organizations, except as needed for the purpose of + developing Internet standards in which case the procedures for + copyrights defined in the Internet Standards process must be + followed, or as required to translate it into languages other than + English. + + The limited permissions granted above are perpetual and will not be + revoked by the Internet Society or its successors or assigns. + + This document and the information contained herein is provided on an + "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING + TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING + BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION + HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF + MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. 
+ +Acknowledgement + + Funding for the RFC Editor function is currently provided by the + Internet Society. + + + + + + + + + + + + + + + + + + + +Border, et al. Informational [Page 45] + diff --git a/ext/picotcp/RFC/rfc3168.txt b/ext/picotcp/RFC/rfc3168.txt new file mode 100644 index 0000000..30b05f7 --- /dev/null +++ b/ext/picotcp/RFC/rfc3168.txt @@ -0,0 +1,3531 @@ + + + + + + +Network Working Group K. Ramakrishnan +Request for Comments: 3168 TeraOptic Networks +Updates: 2474, 2401, 793 S. Floyd +Obsoletes: 2481 ACIRI +Category: Standards Track D. Black + EMC + September 2001 + + + The Addition of Explicit Congestion Notification (ECN) to IP + +Status of this Memo + + This document specifies an Internet standards track protocol for the + Internet community, and requests discussion and suggestions for + improvements. Please refer to the current edition of the "Internet + Official Protocol Standards" (STD 1) for the standardization state + and status of this protocol. Distribution of this memo is unlimited. + +Copyright Notice + + Copyright (C) The Internet Society (2001). All Rights Reserved. + +Abstract + + This memo specifies the incorporation of ECN (Explicit Congestion + Notification) to TCP and IP, including ECN's use of two bits in the + IP header. + +Table of Contents + + 1. Introduction.................................................. 3 + 2. Conventions and Acronyms...................................... 5 + 3. Assumptions and General Principles............................ 5 + 4. Active Queue Management (AQM)................................. 6 + 5. Explicit Congestion Notification in IP........................ 6 + 5.1. ECN as an Indication of Persistent Congestion............... 10 + 5.2. Dropped or Corrupted Packets................................ 11 + 5.3. Fragmentation............................................... 11 + 6. Support from the Transport Protocol........................... 12 + 6.1. 
TCP......................................................... 13 + 6.1.1 TCP Initialization......................................... 14 + 6.1.1.1. Middlebox Issues........................................ 16 + 6.1.1.2. Robust TCP Initialization with an Echoed Reserved Field. 17 + 6.1.2. The TCP Sender............................................ 18 + 6.1.3. The TCP Receiver.......................................... 19 + 6.1.4. Congestion on the ACK-path................................ 20 + 6.1.5. Retransmitted TCP packets................................. 20 + + + +Ramakrishnan, et al. Standards Track [Page 1] + +RFC 3168 The Addition of ECN to IP September 2001 + + + 6.1.6. TCP Window Probes......................................... 22 + 7. Non-compliance by the End Nodes............................... 22 + 8. Non-compliance in the Network................................. 24 + 8.1. Complications Introduced by Split Paths..................... 25 + 9. Encapsulated Packets.......................................... 25 + 9.1. IP packets encapsulated in IP............................... 25 + 9.1.1. The Limited-functionality and Full-functionality Options.. 27 + 9.1.2. Changes to the ECN Field within an IP Tunnel.............. 28 + 9.2. IPsec Tunnels............................................... 29 + 9.2.1. Negotiation between Tunnel Endpoints...................... 31 + 9.2.1.1. ECN Tunnel Security Association Database Field.......... 32 + 9.2.1.2. ECN Tunnel Security Association Attribute............... 32 + 9.2.1.3. Changes to IPsec Tunnel Header Processing............... 33 + 9.2.2. Changes to the ECN Field within an IPsec Tunnel........... 35 + 9.2.3. Comments for IPsec Support................................ 35 + 9.3. IP packets encapsulated in non-IP Packet Headers............ 36 + 10. Issues Raised by Monitoring and Policing Devices............. 36 + 11. Evaluations of ECN........................................... 37 + 11.1. 
Related Work Evaluating ECN................................ 37 + 11.2. A Discussion of the ECN nonce.............................. 37 + 11.2.1. The Incremental Deployment of ECT(1) in Routers.......... 38 + 12. Summary of changes required in IP and TCP.................... 38 + 13. Conclusions.................................................. 40 + 14. Acknowledgements............................................. 41 + 15. References................................................... 41 + 16. Security Considerations...................................... 45 + 17. IPv4 Header Checksum Recalculation........................... 45 + 18. Possible Changes to the ECN Field in the Network............. 45 + 18.1. Possible Changes to the IP Header.......................... 46 + 18.1.1. Erasing the Congestion Indication........................ 46 + 18.1.2. Falsely Reporting Congestion............................. 47 + 18.1.3. Disabling ECN-Capability................................. 47 + 18.1.4. Falsely Indicating ECN-Capability........................ 47 + 18.2. Information carried in the Transport Header................ 48 + 18.3. Split Paths................................................ 49 + 19. Implications of Subverting End-to-End Congestion Control..... 50 + 19.1. Implications for the Network and for Competing Flows....... 50 + 19.2. Implications for the Subverted Flow........................ 53 + 19.3. Non-ECN-Based Methods of Subverting End-to-end Congestion + Control.................................................... 54 + 20. The Motivation for the ECT Codepoints........................ 54 + 20.1. The Motivation for an ECT Codepoint........................ 54 + 20.2. The Motivation for two ECT Codepoints...................... 55 + 21. Why use Two Bits in the IP Header?........................... 57 + 22. Historical Definitions for the IPv4 TOS Octet................ 58 + 23. IANA Considerations.......................................... 60 + 23.1. 
IPv4 TOS Byte and IPv6 Traffic Class Octet................. 60 + 23.2. TCP Header Flags........................................... 61 + + + +Ramakrishnan, et al. Standards Track [Page 2] + +RFC 3168 The Addition of ECN to IP September 2001 + + + 23.3. IPSEC Security Association Attributes....................... 62 + 24. Authors' Addresses........................................... 62 + 25. Full Copyright Statement..................................... 63 + +1. Introduction + + We begin by describing TCP's use of packet drops as an indication of + congestion. Next we explain that with the addition of active queue + management (e.g., RED) to the Internet infrastructure, where routers + detect congestion before the queue overflows, routers are no longer + limited to packet drops as an indication of congestion. Routers can + instead set the Congestion Experienced (CE) codepoint in the IP + header of packets from ECN-capable transports. We describe when the + CE codepoint is to be set in routers, and describe modifications + needed to TCP to make it ECN-capable. Modifications to other + transport protocols (e.g., unreliable unicast or multicast, reliable + multicast, other reliable unicast transport protocols) could be + considered as those protocols are developed and advance through the + standards process. We also describe in this document the issues + involving the use of ECN within IP tunnels, and within IPsec tunnels + in particular. + + One of the guiding principles for this document is that, to the + extent possible, the mechanisms specified here be incrementally + deployable. One challenge to the principle of incremental deployment + has been the prior existence of some IP tunnels that were not + compatible with the use of ECN. As ECN becomes deployed, non- + compatible IP tunnels will have to be upgraded to conform to this + document. 
+ + This document obsoletes RFC 2481, "A Proposal to add Explicit + Congestion Notification (ECN) to IP", which defined ECN as an + Experimental Protocol for the Internet Community. This document also + updates RFC 2474, "Definition of the Differentiated Services Field + (DS Field) in the IPv4 and IPv6 Headers", in defining the ECN field + in the IP header, RFC 2401, "Security Architecture for the Internet + Protocol" to change the handling of IPv4 TOS Byte and IPv6 Traffic + Class Octet in tunnel mode header construction to be compatible with + the use of ECN, and RFC 793, "Transmission Control Protocol", in + defining two new flags in the TCP header. + + TCP's congestion control and avoidance algorithms are based on the + notion that the network is a black-box [Jacobson88, Jacobson90]. The + network's state of congestion or otherwise is determined by end- + systems probing for the network state, by gradually increasing the + load on the network (by increasing the window of packets that are + outstanding in the network) until the network becomes congested and a + packet is lost. Treating the network as a "black-box" and treating + + + +Ramakrishnan, et al. Standards Track [Page 3] + +RFC 3168 The Addition of ECN to IP September 2001 + + + loss as an indication of congestion in the network is appropriate for + pure best-effort data carried by TCP, with little or no sensitivity + to delay or loss of individual packets. In addition, TCP's + congestion management algorithms have techniques built-in (such as + Fast Retransmit and Fast Recovery) to minimize the impact of losses, + from a throughput perspective. However, these mechanisms are not + intended to help applications that are in fact sensitive to the delay + or loss of one or more individual packets. 
Interactive traffic such + as telnet, web-browsing, and transfer of audio and video data can be + sensitive to packet losses (especially when using an unreliable data + delivery transport such as UDP) or to the increased latency of the + packet caused by the need to retransmit the packet after a loss (with + the reliable data delivery semantics provided by TCP). + + Since TCP determines the appropriate congestion window to use by + gradually increasing the window size until it experiences a dropped + packet, this causes the queues at the bottleneck router to build up. + With most packet drop policies at the router that are not sensitive + to the load placed by each individual flow (e.g., tail-drop on queue + overflow), this means that some of the packets of latency-sensitive + flows may be dropped. In addition, such drop policies lead to + synchronization of loss across multiple flows. + + Active queue management mechanisms detect congestion before the queue + overflows, and provide an indication of this congestion to the end + nodes. Thus, active queue management can reduce unnecessary queuing + delay for all traffic sharing that queue. The advantages of active + queue management are discussed in RFC 2309 [RFC2309]. Active queue + management avoids some of the bad properties of dropping on queue + overflow, including the undesirable synchronization of loss across + multiple flows. More importantly, active queue management means that + transport protocols with mechanisms for congestion control (e.g., + TCP) do not have to rely on buffer overflow as the only indication of + congestion. + + Active queue management mechanisms may use one of several methods for + indicating congestion to end-nodes. One is to use packet drops, as is + currently done. However, active queue management allows the router to + separate policies of queuing or dropping packets from the policies + for indicating congestion. 
Thus, active queue management allows + routers to use the Congestion Experienced (CE) codepoint in a packet + header as an indication of congestion, instead of relying solely on + packet drops. This has the potential of reducing the impact of loss + on latency-sensitive flows. + + + + + + + +Ramakrishnan, et al. Standards Track [Page 4] + +RFC 3168 The Addition of ECN to IP September 2001 + + + There exist some middleboxes (firewalls, load balancers, or intrusion + detection systems) in the Internet that either drop a TCP SYN packet + configured to negotiate ECN, or respond with a RST. This document + specifies procedures that TCP implementations may use to provide + robust connectivity even in the presence of such equipment. + +2. Conventions and Acronyms + + The keywords MUST, MUST NOT, REQUIRED, SHALL, SHALL NOT, SHOULD, + SHOULD NOT, RECOMMENDED, MAY, and OPTIONAL, when they appear in this + document, are to be interpreted as described in [RFC2119]. + +3. Assumptions and General Principles + + In this section, we describe some of the important design principles + and assumptions that guided the design choices in this proposal. + + * Because ECN is likely to be adopted gradually, accommodating + migration is essential. Some routers may still only drop packets + to indicate congestion, and some end-systems may not be ECN- + capable. The most viable strategy is one that accommodates + incremental deployment without having to resort to "islands" of + ECN-capable and non-ECN-capable environments. + + * New mechanisms for congestion control and avoidance need to co- + exist and cooperate with existing mechanisms for congestion + control. In particular, new mechanisms have to co-exist with + TCP's current methods of adapting to congestion and with + routers' current practice of dropping packets in periods of + congestion. + + * Congestion may persist over different time-scales. 
The time + scales that we are concerned with are congestion events that may + last longer than a round-trip time. + + * The number of packets in an individual flow (e.g., TCP + connection or an exchange using UDP) may range from a small + number of packets to quite a large number. We are interested in + managing the congestion caused by flows that send enough packets + so that they are still active when network feedback reaches + them. + + * Asymmetric routing is likely to be a normal occurrence in the + Internet. The path (sequence of links and routers) followed by + data packets may be different from the path followed by the + acknowledgment packets in the reverse direction. + + + + + +Ramakrishnan, et al. Standards Track [Page 5] + +RFC 3168 The Addition of ECN to IP September 2001 + + + * Many routers process the "regular" headers in IP packets more + efficiently than they process the header information in IP + options. This suggests keeping congestion experienced + information in the regular headers of an IP packet. + + * It must be recognized that not all end-systems will cooperate in + mechanisms for congestion control. However, new mechanisms + shouldn't make it easier for TCP applications to disable TCP + congestion control. The benefit of lying about participating in + new mechanisms such as ECN-capability should be small. + +4. Active Queue Management (AQM) + + Random Early Detection (RED) is one mechanism for Active Queue + Management (AQM) that has been proposed to detect incipient + congestion [FJ93], and is currently being deployed in the Internet + [RFC2309]. AQM is meant to be a general mechanism using one of + several alternatives for congestion indication, but in the absence of + ECN, AQM is restricted to using packet drops as a mechanism for + congestion indication. AQM drops packets based on the average queue + length exceeding a threshold, rather than only when the queue + overflows. 
However, because AQM may drop packets before the queue + actually overflows, AQM is not always forced by memory limitations to + discard the packet. + + AQM can set a Congestion Experienced (CE) codepoint in the packet + header instead of dropping the packet, when such a field is provided + in the IP header and understood by the transport protocol. The use + of the CE codepoint with ECN allows the receiver(s) to receive the + packet, avoiding the potential for excessive delays due to + retransmissions after packet losses. We use the term 'CE packet' to + denote a packet that has the CE codepoint set. + +5. Explicit Congestion Notification in IP + + This document specifies that the Internet provide a congestion + indication for incipient congestion (as in RED and earlier work + [RJ90]) where the notification can sometimes be through marking + packets rather than dropping them. This uses an ECN field in the IP + header with two bits, making four ECN codepoints, '00' to '11'. The + ECN-Capable Transport (ECT) codepoints '10' and '01' are set by the + data sender to indicate that the end-points of the transport protocol + are ECN-capable; we call them ECT(0) and ECT(1) respectively. The + phrase "the ECT codepoint" in this document refers to either of the + two ECT codepoints. Routers treat the ECT(0) and ECT(1) codepoints + as equivalent. Senders are free to use either the ECT(0) or the + ECT(1) codepoint to indicate ECT, on a packet-by-packet basis. + + + + +Ramakrishnan, et al. Standards Track [Page 6] + +RFC 3168 The Addition of ECN to IP September 2001 + + + The use of both the two codepoints for ECT, ECT(0) and ECT(1), is + motivated primarily by the desire to allow mechanisms for the data + sender to verify that network elements are not erasing the CE + codepoint, and that data receivers are properly reporting to the + sender the receipt of packets with the CE codepoint set, as required + by the transport protocol. 
Guidelines for the senders and receivers + to differentiate between the ECT(0) and ECT(1) codepoints will be + addressed in separate documents, for each transport protocol. In + particular, this document does not address mechanisms for TCP end- + nodes to differentiate between the ECT(0) and ECT(1) codepoints. + Protocols and senders that only require a single ECT codepoint SHOULD + use ECT(0). + + The not-ECT codepoint '00' indicates a packet that is not using ECN. + The CE codepoint '11' is set by a router to indicate congestion to + the end nodes. Routers that have a packet arriving at a full queue + drop the packet, just as they do in the absence of ECN. + + +-----+-----+ + | ECN FIELD | + +-----+-----+ + ECT CE [Obsolete] RFC 2481 names for the ECN bits. + 0 0 Not-ECT + 0 1 ECT(1) + 1 0 ECT(0) + 1 1 CE + + Figure 1: The ECN Field in IP. + + The use of two ECT codepoints essentially gives a one-bit ECN nonce + in packet headers, and routers necessarily "erase" the nonce when + they set the CE codepoint [SCWA99]. For example, routers that erased + the CE codepoint would face additional difficulty in reconstructing + the original nonce, and thus repeated erasure of the CE codepoint + would be more likely to be detected by the end-nodes. The ECN nonce + also can address the problem of misbehaving transport receivers lying + to the transport sender about whether or not the CE codepoint was set + in a packet. The motivations for the use of two ECT codepoints are + discussed in more detail in Section 20, along with some discussion of + alternate possibilities for the fourth ECT codepoint (that is, the + codepoint '01'). Backwards compatibility with earlier ECN + implementations that do not understand the ECT(1) codepoint is + discussed in Section 11. + + In RFC 2481 [RFC2481], the ECN field was divided into the ECN-Capable + Transport (ECT) bit and the CE bit. 
The ECN field with only the + ECN-Capable Transport (ECT) bit set in RFC 2481 corresponds to the + ECT(0) codepoint in this document, and the ECN field with both the + + + +Ramakrishnan, et al. Standards Track [Page 7] + +RFC 3168 The Addition of ECN to IP September 2001 + + + ECT and CE bit in RFC 2481 corresponds to the CE codepoint in this + document. The '01' codepoint was left undefined in RFC 2481, and + this is the reason for recommending the use of ECT(0) when only a + single ECT codepoint is needed. + + 0 1 2 3 4 5 6 7 + +-----+-----+-----+-----+-----+-----+-----+-----+ + | DS FIELD, DSCP | ECN FIELD | + +-----+-----+-----+-----+-----+-----+-----+-----+ + + DSCP: differentiated services codepoint + ECN: Explicit Congestion Notification + + Figure 2: The Differentiated Services and ECN Fields in IP. + + Bits 6 and 7 in the IPv4 TOS octet are designated as the ECN field. + The IPv4 TOS octet corresponds to the Traffic Class octet in IPv6, + and the ECN field is defined identically in both cases. The + definitions for the IPv4 TOS octet [RFC791] and the IPv6 Traffic + Class octet have been superseded by the six-bit DS (Differentiated + Services) Field [RFC2474, RFC2780]. Bits 6 and 7 are listed in + [RFC2474] as Currently Unused, and are specified in RFC 2780 as + approved for experimental use for ECN. Section 22 gives a brief + history of the TOS octet. + + Because of the unstable history of the TOS octet, the use of the ECN + field as specified in this document cannot be guaranteed to be + backwards compatible with those past uses of these two bits that + pre-date ECN. The potential dangers of this lack of backwards + compatibility are discussed in Section 22. + + Upon the receipt by an ECN-Capable transport of a single CE packet, + the congestion control algorithms followed at the end-systems MUST be + essentially the same as the congestion control response to a *single* + dropped packet. 
For example, for ECN-Capable TCP the source TCP is + required to halve its congestion window for any window of data + containing either a packet drop or an ECN indication. + + One reason for requiring that the congestion-control response to the + CE packet be essentially the same as the response to a dropped packet + is to accommodate the incremental deployment of ECN in both end- + systems and in routers. Some routers may drop ECN-Capable packets + (e.g., using the same AQM policies for congestion detection) while + other routers set the CE codepoint, for equivalent levels of + congestion. Similarly, a router might drop a non-ECN-Capable packet + but set the CE codepoint in an ECN-Capable packet, for equivalent + + + + + +Ramakrishnan, et al. Standards Track [Page 8] + +RFC 3168 The Addition of ECN to IP September 2001 + + + levels of congestion. If there were different congestion control + responses to a CE codepoint than to a packet drop, this could result + in unfair treatment for different flows. + + An additional goal is that the end-systems should react to congestion + at most once per window of data (i.e., at most once per round-trip + time), to avoid reacting multiple times to multiple indications of + congestion within a round-trip time. + + For a router, the CE codepoint of an ECN-Capable packet SHOULD only + be set if the router would otherwise have dropped the packet as an + indication of congestion to the end nodes. When the router's buffer + is not yet full and the router is prepared to drop a packet to inform + end nodes of incipient congestion, the router should first check to + see if the ECT codepoint is set in that packet's IP header. If so, + then instead of dropping the packet, the router MAY instead set the + CE codepoint in the IP header. + + An environment where all end nodes were ECN-Capable could allow new + criteria to be developed for setting the CE codepoint, and new + congestion control mechanisms for end-node reaction to CE packets. 
+ However, this is a research issue, and as such is not addressed in + this document. + + When a CE packet (i.e., a packet that has the CE codepoint set) is + received by a router, the CE codepoint is left unchanged, and the + packet is transmitted as usual. When severe congestion has occurred + and the router's queue is full, then the router has no choice but to + drop some packet when a new packet arrives. We anticipate that such + packet losses will become relatively infrequent when a majority of + end-systems become ECN-Capable and participate in TCP or other + compatible congestion control mechanisms. In an ECN-Capable + environment that is adequately-provisioned, packet losses should + occur primarily during transients or in the presence of non- + cooperating sources. + + The above discussion of when CE may be set instead of dropping a + packet applies by default to all Differentiated Services Per-Hop + Behaviors (PHBs) [RFC 2475]. Specifications for PHBs MAY provide + more specifics on how a compliant implementation is to choose between + setting CE and dropping a packet, but this is NOT REQUIRED. A router + MUST NOT set CE instead of dropping a packet when the drop that would + occur is caused by reasons other than congestion or the desire to + indicate incipient congestion to end nodes (e.g., a diffserv edge + node may be configured to unconditionally drop certain classes of + traffic to prevent them from entering its diffserv domain). + + + + + +Ramakrishnan, et al. Standards Track [Page 9] + +RFC 3168 The Addition of ECN to IP September 2001 + + + We expect that routers will set the CE codepoint in response to + incipient congestion as indicated by the average queue size, using + the RED algorithms suggested in [FJ93, RFC2309]. To the best of our + knowledge, this is the only proposal currently under discussion in + the IETF for routers to drop packets proactively, before the buffer + overflows. 
However, this document does not attempt to specify a + particular mechanism for active queue management, leaving that + endeavor, if needed, to other areas of the IETF. While ECN is + inextricably tied up with the need to have a reasonable active queue + management mechanism at the router, the reverse does not hold; active + queue management mechanisms have been developed and deployed + independent of ECN, using packet drops as indications of congestion + in the absence of ECN in the IP architecture. + +5.1. ECN as an Indication of Persistent Congestion + + We emphasize that a *single* packet with the CE codepoint set in an + IP packet causes the transport layer to respond, in terms of + congestion control, as it would to a packet drop. The instantaneous + queue size is likely to see considerable variations even when the + router does not experience persistent congestion. As such, it is + important that transient congestion at a router, reflected by the + instantaneous queue size reaching a threshold much smaller than the + capacity of the queue, not trigger a reaction at the transport layer. + Therefore, the CE codepoint should not be set by a router based on + the instantaneous queue size. + + For example, since the ATM and Frame Relay mechanisms for congestion + indication have typically been defined without an associated notion + of average queue size as the basis for determining that an + intermediate node is congested, we believe that they provide a very + noisy signal. The TCP-sender reaction specified in this document for + ECN is NOT the appropriate reaction for such a noisy signal of + congestion notification. However, if the routers that interface to + the ATM network have a way of maintaining the average queue at the + interface, and use it to come to a reliable determination that the + ATM subnet is congested, they may use the ECN notification that is + defined here. 
+ + We continue to encourage experiments in techniques at layer 2 (e.g., + in ATM switches or Frame Relay switches) to take advantage of ECN. + For example, using a scheme such as RED (where packet marking is + based on the average queue length exceeding a threshold), layer 2 + devices could provide a reasonably reliable indication of congestion. + When all the layer 2 devices in a path set that layer's own + Congestion Experienced codepoint (e.g., the EFCI bit for ATM, the + FECN bit in Frame Relay) in this reliable manner, then the interface + router to the layer 2 network could copy the state of that layer 2 + + + +Ramakrishnan, et al. Standards Track [Page 10] + +RFC 3168 The Addition of ECN to IP September 2001 + + + Congestion Experienced codepoint into the CE codepoint in the IP + header. We recognize that this is not the current practice, nor is + it in current standards. However, encouraging experimentation in this + manner may provide the information needed to enable evolution of + existing layer 2 mechanisms to provide a more reliable means of + congestion indication, when they use a single bit for indicating + congestion. + +5.2. Dropped or Corrupted Packets + + For the proposed use for ECN in this document (that is, for a + transport protocol such as TCP for which a dropped data packet is an + indication of congestion), end nodes detect dropped data packets, and + the congestion response of the end nodes to a dropped data packet is + at least as strong as the congestion response to a received CE + packet. To ensure the reliable delivery of the congestion indication + of the CE codepoint, an ECT codepoint MUST NOT be set in a packet + unless the loss of that packet in the network would be detected by + the end nodes and interpreted as an indication of congestion. 
+ + Transport protocols such as TCP do not necessarily detect all packet + drops, such as the drop of a "pure" ACK packet; for example, TCP does + not reduce the arrival rate of subsequent ACK packets in response to + an earlier dropped ACK packet. Any proposal for extending ECN- + Capability to such packets would have to address issues such as the + case of an ACK packet that was marked with the CE codepoint but was + later dropped in the network. We believe that this aspect is still + the subject of research, so this document specifies that at this + time, "pure" ACK packets MUST NOT indicate ECN-Capability. + + Similarly, if a CE packet is dropped later in the network due to + corruption (bit errors), the end nodes should still invoke congestion + control, just as TCP would today in response to a dropped data + packet. This issue of corrupted CE packets would have to be + considered in any proposal for the network to distinguish between + packets dropped due to corruption, and packets dropped due to + congestion or buffer overflow. In particular, the ubiquitous + deployment of ECN would not, in and of itself, be a sufficient + development to allow end-nodes to interpret packet drops as + indications of corruption rather than congestion. + +5.3. Fragmentation + + ECN-capable packets MAY have the DF (Don't Fragment) bit set. + Reassembly of a fragmented packet MUST NOT lose indications of + congestion. In other words, if any fragment of an IP packet to be + reassembled has the CE codepoint set, then one of two actions MUST be + taken: + + + +Ramakrishnan, et al. Standards Track [Page 11] + +RFC 3168 The Addition of ECN to IP September 2001 + + + * Set the CE codepoint on the reassembled packet. However, this + MUST NOT occur if any of the other fragments contributing to + this reassembly carries the Not-ECT codepoint. + + * The packet is dropped, instead of being reassembled, for any + other reason. + + If both actions are applicable, either MAY be chosen. 
Reassembly of + a fragmented packet MUST NOT change the ECN codepoint when all of the + fragments carry the same codepoint. + + We would note that because RFC 2481 did not specify reassembly + behavior, older ECN implementations conformant with that Experimental + RFC do not necessarily perform reassembly correctly, in terms of + preserving the CE codepoint in a fragment. The sender could avoid + the consequences of this behavior by setting the DF bit in ECN- + Capable packets. + + Situations may arise in which the above reassembly specification is + insufficiently precise. For example, if there is a malicious or + broken entity in the path at or after the fragmentation point, packet + fragments could carry a mixture of ECT(0), ECT(1), and/or Not-ECT + codepoints. The reassembly specification above does not place + requirements on reassembly of fragments in this case. In situations + where more precise reassembly behavior would be required, protocol + specifications SHOULD instead specify that DF MUST be set in all + ECN-capable packets sent by the protocol. + +6. Support from the Transport Protocol + + ECN requires support from the transport protocol, in addition to the + functionality given by the ECN field in the IP packet header. The + transport protocol might require negotiation between the endpoints + during setup to determine that all of the endpoints are ECN-capable, + so that the sender can set the ECT codepoint in transmitted packets. + Second, the transport protocol must be capable of reacting + appropriately to the receipt of CE packets. This reaction could be + in the form of the data receiver informing the data sender of the + received CE packet (e.g., TCP), of the data receiver unsubscribing to + a layered multicast group (e.g., RLM [MJV96]), or of some other + action that ultimately reduces the arrival rate of that flow on that + congested link. 
CE packets indicate persistent rather than transient + congestion (see Section 5.1), and hence reactions to the receipt of + CE packets should be those appropriate for persistent congestion. + + This document only addresses the addition of ECN Capability to TCP, + leaving issues of ECN in other transport protocols to further + research. For TCP, ECN requires three new pieces of functionality: + + + +Ramakrishnan, et al. Standards Track [Page 12] + +RFC 3168 The Addition of ECN to IP September 2001 + + + negotiation between the endpoints during connection setup to + determine if they are both ECN-capable; an ECN-Echo (ECE) flag in the + TCP header so that the data receiver can inform the data sender when + a CE packet has been received; and a Congestion Window Reduced (CWR) + flag in the TCP header so that the data sender can inform the data + receiver that the congestion window has been reduced. The support + required from other transport protocols is likely to be different, + particularly for unreliable or reliable multicast transport + protocols, and will have to be determined as other transport + protocols are brought to the IETF for standardization. + + In a mild abuse of terminology, in this document we refer to `TCP + packets' instead of `TCP segments'. + +6.1. TCP + + The following sections describe in detail the proposed use of ECN in + TCP. This proposal is described in essentially the same form in + [Floyd94]. We assume that the source TCP uses the standard congestion + control algorithms of Slow-start, Fast Retransmit and Fast Recovery + [RFC2581]. + + This proposal specifies two new flags in the Reserved field of the + TCP header. The TCP mechanism for negotiating ECN-Capability uses + the ECN-Echo (ECE) flag in the TCP header. Bit 9 in the Reserved + field of the TCP header is designated as the ECN-Echo flag. 
The + location of the 6-bit Reserved field in the TCP header is shown in + Figure 4 of RFC 793 [RFC793] (and is reproduced below for + completeness). This specification of the ECN Field leaves the + Reserved field as a 4-bit field using bits 4-7. + + To enable the TCP receiver to determine when to stop setting the + ECN-Echo flag, we introduce a second new flag in the TCP header, the + CWR flag. The CWR flag is assigned to Bit 8 in the Reserved field of + the TCP header. + + 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ + | | | U | A | P | R | S | F | + | Header Length | Reserved | R | C | S | S | Y | I | + | | | G | K | H | T | N | N | + +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ + + Figure 3: The old definition of bytes 13 and 14 of the TCP + header. + + + + + + +Ramakrishnan, et al. Standards Track [Page 13] + +RFC 3168 The Addition of ECN to IP September 2001 + + + 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ + | | | C | E | U | A | P | R | S | F | + | Header Length | Reserved | W | C | R | C | S | S | Y | I | + | | | R | E | G | K | H | T | N | N | + +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ + + Figure 4: The new definition of bytes 13 and 14 of the TCP + Header. + + Thus, ECN uses the ECT and CE flags in the IP header (as shown in + Figure 1) for signaling between routers and connection endpoints, and + uses the ECN-Echo and CWR flags in the TCP header (as shown in Figure + 4) for TCP-endpoint to TCP-endpoint signaling. For a TCP connection, + a typical sequence of events in an ECN-based reaction to congestion + is as follows: + + * An ECT codepoint is set in packets transmitted by the sender to + indicate that ECN is supported by the transport entities for + these packets. 
+ + * An ECN-capable router detects impending congestion and detects + that an ECT codepoint is set in the packet it is about to drop. + Instead of dropping the packet, the router chooses to set the CE + codepoint in the IP header and forwards the packet. + + * The receiver receives the packet with the CE codepoint set, and + sets the ECN-Echo flag in its next TCP ACK sent to the sender. + + * The sender receives the TCP ACK with ECN-Echo set, and reacts to + the congestion as if a packet had been dropped. + + * The sender sets the CWR flag in the TCP header of the next + packet sent to the receiver to acknowledge its receipt of and + reaction to the ECN-Echo flag. + + The negotiation for using ECN by the TCP transport entities and the + use of the ECN-Echo and CWR flags is described in more detail in the + sections below. + +6.1.1 TCP Initialization + + In the TCP connection setup phase, the source and destination TCPs + exchange information about their willingness to use ECN. Subsequent + to the completion of this negotiation, the TCP sender sets an ECT + codepoint in the IP header of data packets to indicate to the network + that the transport is capable and willing to participate in ECN for + this packet. This indicates to the routers that they may mark this + + + +Ramakrishnan, et al. Standards Track [Page 14] + +RFC 3168 The Addition of ECN to IP September 2001 + + + packet with the CE codepoint, if they would like to use that as a + method of congestion notification. If the TCP connection does not + wish to use ECN notification for a particular packet, the sending TCP + sets the ECN codepoint to not-ECT, and the TCP receiver ignores the + CE codepoint in the received packet. + + For this discussion, we designate the initiating host as Host A and + the responding host as Host B. 
We call a SYN packet with the ECE and + CWR flags set an "ECN-setup SYN packet", and we call a SYN packet + with at least one of the ECE and CWR flags not set a "non-ECN-setup + SYN packet". Similarly, we call a SYN-ACK packet with only the ECE + flag set but the CWR flag not set an "ECN-setup SYN-ACK packet", and + we call a SYN-ACK packet with any other configuration of the ECE and + CWR flags a "non-ECN-setup SYN-ACK packet". + + Before a TCP connection can use ECN, Host A sends an ECN-setup SYN + packet, and Host B sends an ECN-setup SYN-ACK packet. For a SYN + packet, the setting of both ECE and CWR in the ECN-setup SYN packet + is defined as an indication that the sending TCP is ECN-Capable, + rather than as an indication of congestion or of response to + congestion. More precisely, an ECN-setup SYN packet indicates that + the TCP implementation transmitting the SYN packet will participate + in ECN as both a sender and receiver. Specifically, as a receiver, + it will respond to incoming data packets that have the CE codepoint + set in the IP header by setting ECE in outgoing TCP Acknowledgement + (ACK) packets. As a sender, it will respond to incoming packets that + have ECE set by reducing the congestion window and setting CWR when + appropriate. An ECN-setup SYN packet does not commit the TCP sender + to setting the ECT codepoint in any or all of the packets it may + transmit. However, the commitment to respond appropriately to + incoming packets with the CE codepoint set remains even if the TCP + sender in a later transmission, within this TCP connection, sends a + SYN packet without ECE and CWR set. + + When Host B sends an ECN-setup SYN-ACK packet, it sets the ECE flag + but not the CWR flag. An ECN-setup SYN-ACK packet is defined as an + indication that the TCP transmitting the SYN-ACK packet is ECN- + Capable. As with the SYN packet, an ECN-setup SYN-ACK packet does + not commit the TCP host to setting the ECT codepoint in transmitted + packets. 
+ + The following rules apply to the sending of ECN-setup packets within + a TCP connection, where a TCP connection is defined by the standard + rules for TCP connection establishment and termination. + + * If a host has received an ECN-setup SYN packet, then it MAY send + an ECN-setup SYN-ACK packet. Otherwise, it MUST NOT send an + ECN-setup SYN-ACK packet. + + + +Ramakrishnan, et al. Standards Track [Page 15] + +RFC 3168 The Addition of ECN to IP September 2001 + + + * A host MUST NOT set ECT on data packets unless it has sent at + least one ECN-setup SYN or ECN-setup SYN-ACK packet, and has + received at least one ECN-setup SYN or ECN-setup SYN-ACK packet, + and has sent no non-ECN-setup SYN or non-ECN-setup SYN-ACK + packet. If a host has received at least one non-ECN-setup SYN + or non-ECN-setup SYN-ACK packet, then it SHOULD NOT set ECT on + data packets. + + * If a host ever sets the ECT codepoint on a data packet, then + that host MUST correctly set/clear the CWR TCP bit on all + subsequent packets in the connection. + + * If a host has sent at least one ECN-setup SYN or ECN-setup SYN- + ACK packet, and has received no non-ECN-setup SYN or non-ECN- + setup SYN-ACK packet, then if that host receives TCP data + packets with ECT and CE codepoints set in the IP header, then + that host MUST process these packets as specified for an ECN- + capable connection. + + * A host that is not willing to use ECN on a TCP connection SHOULD + clear both the ECE and CWR flags in all non-ECN-setup SYN and/or + SYN-ACK packets that it sends to indicate this unwillingness. + Receivers MUST correctly handle all forms of the non-ECN-setup + SYN and SYN-ACK packets. + + * A host MUST NOT set ECT on SYN or SYN-ACK packets. + + A TCP client enters TIME-WAIT state after receiving a FIN-ACK, and + transitions to CLOSED state after a timeout. Many TCP + implementations create a new TCP connection if they receive an in- + window SYN packet during TIME-WAIT state. 
When a TCP host enters + TIME-WAIT or CLOSED state, it should ignore any previous state about + the negotiation of ECN for that connection. + +6.1.1.1. Middlebox Issues + + ECN introduces the use of the ECN-Echo and CWR flags in the TCP + header (as shown in Figure 3) for initialization. There exist some + faulty firewalls, load balancers, and intrusion detection systems in + the Internet that either drop an ECN-setup SYN packet or respond with + a RST, in the belief that such a packet (with these bits set) is a + signature for a port-scanning tool that could be used in a denial- + of-service attack. Some of the offending equipment has been + identified, and a web page [FIXES] contains a list of non-compliant + products and the fixes posted by the vendors, where these are + available. The TBIT web page [TBIT] lists some of the web servers + affected by this faulty equipment. We mention this in this document + as a warning to the community of this problem. + + + +Ramakrishnan, et al. Standards Track [Page 16] + +RFC 3168 The Addition of ECN to IP September 2001 + + + To provide robust connectivity even in the presence of such faulty + equipment, a host that receives a RST in response to the transmission + of an ECN-setup SYN packet MAY resend a SYN with CWR and ECE cleared. + This could result in a TCP connection being established without using + ECN. + + A host that receives no reply to an ECN-setup SYN within the normal + SYN retransmission timeout interval MAY resend the SYN and any + subsequent SYN retransmissions with CWR and ECE cleared. To overcome + normal packet loss that results in the original SYN being lost, the + originating host may retransmit one or more ECN-setup SYN packets + before giving up and retransmitting the SYN with the CWR and ECE bits + cleared. + + We note that in this case, the following example scenario is + possible: + + (1) Host A: Sends an ECN-setup SYN. + (2) Host B: Sends an ECN-setup SYN/ACK, packet is dropped or delayed. 
+ (3) Host A: Sends a non-ECN-setup SYN. + (4) Host B: Sends a non-ECN-setup SYN/ACK. + + We note that in this case, following the procedures above, neither + Host A nor Host B may set the ECT bit on data packets. Further, an + important consequence of the rules for ECN setup and usage in Section + 6.1.1 is that a host is forbidden from using the reception of ECT + data packets as an implicit signal that the other host is ECN- + capable. + +6.1.1.2. Robust TCP Initialization with an Echoed Reserved Field + + There is the question of why we chose to have the TCP sending the SYN + set two ECN-related flags in the Reserved field of the TCP header for + the SYN packet, while the responding TCP sending the SYN-ACK sets + only one ECN-related flag in the SYN-ACK packet. This asymmetry is + necessary for the robust negotiation of ECN-capability with some + deployed TCP implementations. There exists at least one faulty TCP + implementation in which TCP receivers set the Reserved field of the + TCP header in ACK packets (and hence the SYN-ACK) simply to reflect + the Reserved field of the TCP header in the received data packet. + Because the TCP SYN packet sets the ECN-Echo and CWR flags to + indicate ECN-capability, while the SYN-ACK packet sets only the ECN- + Echo flag, the sending TCP correctly interprets a receiver's + reflection of its own flags in the Reserved field as an indication + that the receiver is not ECN-capable. The sending TCP is not misled + by a faulty TCP implementation sending a SYN-ACK packet that simply + reflects the Reserved field of the incoming SYN packet. + + + + +Ramakrishnan, et al. Standards Track [Page 17] + +RFC 3168 The Addition of ECN to IP September 2001 + + +6.1.2. The TCP Sender + + For a TCP connection using ECN, new data packets are transmitted with + an ECT codepoint set in the IP header. When only one ECT codepoint + is needed by a sender for all packets sent on a TCP connection, + ECT(0) SHOULD be used.
If the sender receives an ECN-Echo (ECE) ACK + packet (that is, an ACK packet with the ECN-Echo flag set in the TCP + header), then the sender knows that congestion was encountered in the + network on the path from the sender to the receiver. The indication + of congestion should be treated just as a congestion loss in non- + ECN-Capable TCP. That is, the TCP source halves the congestion window + "cwnd" and reduces the slow start threshold "ssthresh". The sending + TCP SHOULD NOT increase the congestion window in response to the + receipt of an ECN-Echo ACK packet. + + TCP should not react to congestion indications more than once every + window of data (or more loosely, more than once every round-trip + time). That is, the TCP sender's congestion window should be reduced + only once in response to a series of dropped and/or CE packets from a + single window of data. In addition, the TCP source should not + decrease the slow-start threshold, ssthresh, if it has been decreased + within the last round trip time. However, if any retransmitted + packets are dropped, then this is interpreted by the source TCP as a + new instance of congestion. + + After the source TCP reduces its congestion window in response to a + CE packet, incoming acknowledgments that continue to arrive can + "clock out" outgoing packets as allowed by the reduced congestion + window. If the congestion window consists of only one MSS (maximum + segment size), and the sending TCP receives an ECN-Echo ACK packet, + then the sending TCP should in principle still reduce its congestion + window in half. However, the value of the congestion window is + bounded below by a value of one MSS. If the sending TCP were to + continue to send, using a congestion window of 1 MSS, this results in + the transmission of one packet per round-trip time. It is necessary + to still reduce the sending rate of the TCP sender even further, on + receipt of an ECN-Echo packet when the congestion window is one. 
We + use the retransmit timer as a means of reducing the rate further in + this circumstance. Therefore, the sending TCP MUST reset the + retransmit timer on receiving the ECN-Echo packet when the congestion + window is one. The sending TCP will then be able to send a new + packet only when the retransmit timer expires. + + When an ECN-Capable TCP sender reduces its congestion window for any + reason (because of a retransmit timeout, a Fast Retransmit, or in + response to an ECN Notification), the TCP sender sets the CWR flag in + the TCP header of the first new data packet sent after the window + reduction. If that data packet is dropped in the network, then the + + + +Ramakrishnan, et al. Standards Track [Page 18] + +RFC 3168 The Addition of ECN to IP September 2001 + + + sending TCP will have to reduce the congestion window again and + retransmit the dropped packet. + + We ensure that the "Congestion Window Reduced" information is + reliably delivered to the TCP receiver. This comes about from the + fact that if the new data packet carrying the CWR flag is dropped, + then the TCP sender will have to again reduce its congestion window, + and send another new data packet with the CWR flag set. Thus, the + CWR bit in the TCP header SHOULD NOT be set on retransmitted packets. + + When the TCP data sender is ready to set the CWR bit after reducing + the congestion window, it SHOULD set the CWR bit only on the first + new data packet that it transmits. + + [Floyd94] discusses TCP's response to ECN in more detail. [Floyd98] + discusses the validation test in the ns simulator, which illustrates + a wide range of ECN scenarios. These scenarios include the following: + an ECN followed by another ECN, a Fast Retransmit, or a Retransmit + Timeout; a Retransmit Timeout or a Fast Retransmit followed by an + ECN; and a congestion window of one packet followed by an ECN. 
+ + TCP follows existing algorithms for sending data packets in response + to incoming ACKs, multiple duplicate acknowledgments, or retransmit + timeouts [RFC2581]. TCP also follows the normal procedures for + increasing the congestion window when it receives ACK packets without + the ECN-Echo bit set [RFC2581]. + +6.1.3. The TCP Receiver + + When TCP receives a CE data packet at the destination end-system, the + TCP data receiver sets the ECN-Echo flag in the TCP header of the + subsequent ACK packet. If there is any ACK withholding implemented, + as in current "delayed-ACK" TCP implementations where the TCP + receiver can send an ACK for two arriving data packets, then the + ECN-Echo flag in the ACK packet will be set to '1' if the CE + codepoint is set in any of the data packets being acknowledged. That + is, if any of the received data packets are CE packets, then the + returning ACK has the ECN-Echo flag set. + + To provide robustness against the possibility of a dropped ACK packet + carrying an ECN-Echo flag, the TCP receiver sets the ECN-Echo flag in + a series of ACK packets sent subsequently. The TCP receiver uses the + CWR flag received from the TCP sender to determine when to stop + setting the ECN-Echo flag. + + After a TCP receiver sends an ACK packet with the ECN-Echo bit set, + that TCP receiver continues to set the ECN-Echo flag in all the ACK + packets it sends (whether they acknowledge CE data packets or non-CE + + + +Ramakrishnan, et al. Standards Track [Page 19] + +RFC 3168 The Addition of ECN to IP September 2001 + + + data packets) until it receives a CWR packet (a packet with the CWR + flag set). After the receipt of the CWR packet, acknowledgments for + subsequent non-CE data packets do not have the ECN-Echo flag set. If + another CE packet is received by the data receiver, the receiver + would once again send ACK packets with the ECN-Echo flag set. 
While + the receipt of a CWR packet does not guarantee that the data sender + received the ECN-Echo message, this does suggest that the data sender + reduced its congestion window at some point *after* it sent the data + packet for which the CE codepoint was set. + + We have already specified that a TCP sender is not required to reduce + its congestion window more than once per window of data. Some care + is required if the TCP sender is to avoid unnecessary reductions of + the congestion window when a window of data includes both dropped + packets and (marked) CE packets. This is illustrated in [Floyd98]. + +6.1.4. Congestion on the ACK-path + + For the current generation of TCP congestion control algorithms, pure + acknowledgement packets (e.g., packets that do not contain any + accompanying data) MUST be sent with the not-ECT codepoint. Current + TCP receivers have no mechanisms for reducing traffic on the ACK-path + in response to congestion notification. Mechanisms for responding to + congestion on the ACK-path are areas for current and future research. + (One simple possibility would be for the sender to reduce its + congestion window when it receives a pure ACK packet with the CE + codepoint set). For current TCP implementations, a single dropped ACK + generally has only a very small effect on the TCP's sending rate. + +6.1.5. Retransmitted TCP packets + + This document specifies ECN-capable TCP implementations MUST NOT set + either ECT codepoint (ECT(0) or ECT(1)) in the IP header for + retransmitted data packets, and that the TCP data receiver SHOULD + ignore the ECN field on arriving data packets that are outside of the + receiver's current window. This is for greater security against + denial-of-service attacks, as well as for robustness of the ECN + congestion indication with packets that are dropped later in the + network. 
+ + First, we note that if the TCP sender were to set an ECT codepoint on + a retransmitted packet, then if an unnecessarily-retransmitted packet + was later dropped in the network, the end nodes would never receive + the indication of congestion from the router setting the CE + codepoint. Thus, setting an ECT codepoint on retransmitted data + packets is not consistent with the robust delivery of the congestion + indication even for packets that are later dropped in the network. + + + + +Ramakrishnan, et al. Standards Track [Page 20] + +RFC 3168 The Addition of ECN to IP September 2001 + + + In addition, an attacker capable of spoofing the IP source address of + the TCP sender could send data packets with arbitrary sequence + numbers, with the CE codepoint set in the IP header. On receiving + this spoofed data packet, the TCP data receiver would determine that + the data does not lie in the current receive window, and return a + duplicate acknowledgement. We define an out-of-window packet at the + TCP data receiver as a data packet that lies outside the receiver's + current window. On receiving an out-of-window packet, the TCP data + receiver has to decide whether or not to treat the CE codepoint in + the packet header as a valid indication of congestion, and therefore + whether to return ECN-Echo indications to the TCP data sender. If + the TCP data receiver ignored the CE codepoint in an out-of-window + packet, then the TCP data sender would not receive this possibly- + legitimate indication of congestion from the network, resulting in a + violation of end-to-end congestion control. On the other hand, if + the TCP data receiver honors the CE indication in the out-of-window + packet, and reports the indication of congestion to the TCP data + sender, then the malicious node that created the spoofed, out-of- + window packet has successfully "attacked" the TCP connection by + forcing the data sender to unnecessarily reduce (halve) its + congestion window. 
To prevent such a denial-of-service attack, we + specify that a legitimate TCP data sender MUST NOT set an ECT + codepoint on retransmitted data packets, and that the TCP data + receiver SHOULD ignore the CE codepoint on out-of-window packets. + + One drawback of not setting ECT(0) or ECT(1) on retransmitted packets + is that it denies ECN protection for retransmitted packets. However, + for an ECN-capable TCP connection in a fully-ECN-capable environment + with mild congestion, packets should rarely be dropped due to + congestion in the first place, and so instances of retransmitted + packets should rarely arise. If packets are being retransmitted, + then there are already packet losses (from corruption or from + congestion) that ECN has been unable to prevent. + + We note that if the router sets the CE codepoint for an ECN-capable + data packet within a TCP connection, then the TCP connection is + guaranteed to receive that indication of congestion, or to receive + some other indication of congestion within the same window of data, + even if this packet is dropped or reordered in the network. We + consider two cases, when the packet is later retransmitted, and when + the packet is not later retransmitted. + + In the first case, if the packet is either dropped or delayed, and at + some point retransmitted by the data sender, then the retransmission + is a result of a Fast Retransmit or a Retransmit Timeout for either + that packet or for some prior packet in the same window of data. In + this case, because the data sender already has retransmitted this + packet, we know that the data sender has already responded to an + + + +Ramakrishnan, et al. Standards Track [Page 21] + +RFC 3168 The Addition of ECN to IP September 2001 + + + indication of congestion for some packet within the same window of + data as the original packet. 
Thus, even if the first transmission of + the packet is dropped in the network, or is delayed, if it had the CE + codepoint set, and is later ignored by the data receiver as an out- + of-window packet, this is not a problem, because the sender has + already responded to an indication of congestion for that window of + data. + + In the second case, if the packet is never retransmitted by the data + sender, then this data packet is the only copy of this data received + by the data receiver, and therefore arrives at the data receiver as + an in-window packet, regardless of how much the packet might be + delayed or reordered. In this case, if the CE codepoint is set on + the packet within the network, this will be treated by the data + receiver as a valid indication of congestion. + +6.1.6. TCP Window Probes. + + When the TCP data receiver advertises a zero window, the TCP data + sender sends window probes to determine if the receiver's window has + increased. Window probe packets do not contain any user data except + for the sequence number, which is a byte. If a window probe packet + is dropped in the network, this loss is not detected by the receiver. + Therefore, the TCP data sender MUST NOT set either an ECT codepoint + or the CWR bit on window probe packets. + + However, because window probes use exact sequence numbers, they + cannot be easily spoofed in denial-of-service attacks. Therefore, if + a window probe arrives with the CE codepoint set, then the receiver + SHOULD respond to the ECN indications. + +7. Non-compliance by the End Nodes + + This section discusses concerns about the vulnerability of ECN to + non-compliant end-nodes (i.e., end nodes that set the ECT codepoint + in transmitted packets but do not respond to received CE packets). + We argue that the addition of ECN to the IP architecture will not + significantly increase the current vulnerability of the architecture + to unresponsive flows. 
+ + Even for non-ECN environments, there are serious concerns about the + damage that can be done by non-compliant or unresponsive flows (that + is, flows that do not respond to congestion control indications by + reducing their arrival rate at the congested link). For example, an + end-node could "turn off congestion control" by not reducing its + congestion window in response to packet drops. This is a concern for + the current Internet. It has been argued that routers will have to + deploy mechanisms to detect and differentially treat packets from + + + +Ramakrishnan, et al. Standards Track [Page 22] + +RFC 3168 The Addition of ECN to IP September 2001 + + + non-compliant flows [RFC2309,FF99]. It has also been suggested that + techniques such as end-to-end per-flow scheduling and isolation of + one flow from another, differentiated services, or end-to-end + reservations could remove some of the more damaging effects of + unresponsive flows. + + It might seem that dropping packets in itself is an adequate + deterrent for non-compliance, and that the use of ECN removes this + deterrent. We would argue in response that (1) ECN-capable routers + preserve packet-dropping behavior in times of high congestion; and + (2) even in times of high congestion, dropping packets in itself is + not an adequate deterrent for non-compliance. + + First, ECN-Capable routers will only mark packets (as opposed to + dropping them) when the packet marking rate is reasonably low. During + periods where the average queue size exceeds an upper threshold, and + therefore the potential packet marking rate would be high, our + recommendation is that routers drop packets rather than set the CE + codepoint in packet headers. + + During the periods of low or moderate packet marking rates when ECN + would be deployed, there would be little deterrent effect on + unresponsive flows of dropping rather than marking those packets.
For + example, delay-insensitive flows using reliable delivery might have + an incentive to increase rather than to decrease their sending rate + in the presence of dropped packets. Similarly, delay-sensitive flows + using unreliable delivery might increase their use of FEC in response + to an increased packet drop rate, increasing rather than decreasing + their sending rate. For the same reasons, we do not believe that + packet dropping itself is an effective deterrent for non-compliance + even in an environment of high packet drop rates, when all flows are + sharing the same packet drop rate. + + Several methods have been proposed to identify and restrict non- + compliant or unresponsive flows. The addition of ECN to the network + environment would not in any way increase the difficulty of designing + and deploying such mechanisms. If anything, the addition of ECN to + the architecture would make the job of identifying unresponsive flows + slightly easier. For example, in an ECN-Capable environment routers + are not limited to information about packets that are dropped or have + the CE codepoint set at that router itself; in such an environment, + routers could also take note of arriving CE packets that indicate + congestion encountered by that packet earlier in the path. + + + + + + + + +Ramakrishnan, et al. Standards Track [Page 23] + +RFC 3168 The Addition of ECN to IP September 2001 + + +8. Non-compliance in the Network + + This section considers the issues when a router is operating, + possibly maliciously, to modify either of the bits in the ECN field. + We note that in IPv4, the IP header is protected from bit errors by a + header checksum; this is not the case in IPv6. Thus for IPv6 the + ECN field can be accidentally modified by bit errors on links or in + routers without being detected by an IP header checksum. 
+ + By tampering with the bits in the ECN field, an adversary (or a + broken router) could do one or more of the following: falsely report + congestion, disable ECN-Capability for an individual packet, erase + the ECN congestion indication, or falsely indicate ECN-Capability. + Section 18 systematically examines the various cases by which the ECN + field could be modified. The important criterion considered in + determining the consequences of such modifications is whether it is + likely to lead to poorer behavior in any dimension (throughput, + delay, fairness or functionality) than if a router were to drop a + packet. + + The first two possible changes, falsely reporting congestion or + disabling ECN-Capability for an individual packet, are no worse than + if the router were to simply drop the packet. From a congestion + control point of view, setting the CE codepoint in the absence of + congestion by a non-compliant router would be no worse than a router + dropping a packet unnecessarily. By "erasing" an ECT codepoint of a + packet that is later dropped in the network, a router's actions could + result in an unnecessary packet drop for that packet later in the + network. + + However, as discussed in Section 18, a router that erases the ECN + congestion indication or falsely indicates ECN-Capability could + potentially do more damage to the flow than if it had simply dropped + the packet. A rogue or broken router that "erased" the CE codepoint + in arriving CE packets would prevent that indication of congestion + from reaching downstream receivers. This could result in the failure + of congestion control for that flow and a resulting increase in + congestion in the network, ultimately resulting in subsequent packets + dropped for this flow as the average queue size increased at the + congested gateway.
+ + Section 19 considers the potential repercussions of subverting end- + to-end congestion control by either falsely indicating ECN- + Capability, or by erasing the congestion indication in ECN (the CE- + codepoint). We observe in Section 19 that the consequence of + subverting ECN-based congestion control may lead to potential + unfairness, but this is likely to be no worse than the subversion of + either ECN-based or packet-based congestion control by the end nodes. + + + +Ramakrishnan, et al. Standards Track [Page 24] + +RFC 3168 The Addition of ECN to IP September 2001 + + +8.1. Complications Introduced by Split Paths + + If a router or other network element has access to all of the packets + of a flow, then that router could do no more damage to a flow by + altering the ECN field than it could by simply dropping all of the + packets from that flow. However, in some cases, a malicious or + broken router might have access to only a subset of the packets from + a flow. The question is as follows: can this router, by altering + the ECN field in this subset of the packets, do more damage to that + flow than if it has simply dropped that set of the packets? + + This is also discussed in detail in Section 18, which concludes as + follows: It is true that the adversary that has access only to a + subset of packets in an aggregate might, by subverting ECN-based + congestion control, be able to deny the benefits of ECN to the other + packets in the aggregate. While this is undesirable, this is not a + sufficient concern to result in disabling ECN. + +9. Encapsulated Packets + +9.1. IP packets encapsulated in IP + + The encapsulation of IP packet headers in tunnels is used in many + places, including IPsec and IP in IP [RFC2003]. This section + considers issues related to interactions between ECN and IP tunnels, + and specifies two alternative solutions. 
This discussion is + complemented by RFC 2983's discussion of interactions between + Differentiated Services and IP tunnels of various forms [RFC 2983], + as Differentiated Services uses the remaining six bits of the IP + header octet that is used by ECN (see Figure 2 in Section 5). + + + Some IP tunnel modes are based on adding a new "outer" IP header that + encapsulates the original, or "inner" IP header and its associated + packet. In many cases, the new "outer" IP header may be added and + removed at intermediate points along a connection, enabling the + network to establish a tunnel without requiring endpoint + participation. We denote tunnels that specify that the outer header + be discarded at tunnel egress as "simple tunnels". + + ECN uses the ECN field in the IP header for signaling between routers + and connection endpoints. ECN interacts with IP tunnels based on the + treatment of the ECN field in the IP header. In simple IP tunnels + the octet containing the ECN field is copied or mapped from the inner + IP header to the outer IP header at IP tunnel ingress, and the outer + header's copy of this field is discarded at IP tunnel egress. If the + outer header were to be simply discarded without taking care to deal + with the ECN field, and an ECN-capable router were to set the CE + + + +Ramakrishnan, et al. Standards Track [Page 25] + +RFC 3168 The Addition of ECN to IP September 2001 + + + (Congestion Experienced) codepoint within a packet in a simple IP + tunnel, this indication would be discarded at tunnel egress, losing + the indication of congestion. + + Thus, the use of ECN over simple IP tunnels would result in routers + attempting to use the outer IP header to signal congestion to + endpoints, but those congestion warnings never arriving because the + outer header is discarded at the tunnel egress point. 
This problem + was encountered with ECN and IPsec in tunnel mode, and RFC 2481 + recommended that ECN not be used with the older simple IPsec tunnels + in order to avoid this behavior and its consequences. When ECN + becomes widely deployed, then simple tunnels likely to carry ECN- + capable traffic will have to be changed. If ECN-capable traffic is + carried by a simple tunnel through a congested, ECN-capable router, + this could result in subsequent packets being dropped for this flow + as the average queue size increases at the congested router, as + discussed in Section 8 above. + + From a security point of view, the use of ECN in the outer header of + an IP tunnel might raise security concerns because an adversary could + tamper with the ECN information that propagates beyond the tunnel + endpoint. Based on an analysis in Sections 18 and 19 of these + concerns and the resultant risks, our overall approach is to make + support for ECN an option for IP tunnels, so that an IP tunnel can be + specified or configured either to use ECN or not to use ECN in the + outer header of the tunnel. Thus, in environments or tunneling + protocols where the risks of using ECN are judged to outweigh its + benefits, the tunnel can simply not use ECN in the outer header. + Then the only indication of congestion experienced at routers within + the tunnel would be through packet loss. + + The result is that there are two viable options for the behavior of + ECN-capable connections over an IP tunnel, including IPsec tunnels: + + * A limited-functionality option in which ECN is preserved in the + inner header, but disabled in the outer header. The only + mechanism available for signaling congestion occurring within + the tunnel in this case is dropped packets. + + * A full-functionality option that supports ECN in both the inner + and outer headers, and propagates congestion warnings from nodes + within the tunnel to endpoints. 
+ + Support for these options requires varying amounts of changes to IP + header processing at tunnel ingress and egress. A small subset of + these changes sufficient to support only the limited-functionality + option would be sufficient to eliminate any incompatibility between + ECN and IP tunnels. + + + +Ramakrishnan, et al. Standards Track [Page 26] + +RFC 3168 The Addition of ECN to IP September 2001 + + + One goal of this document is to give guidance about the tradeoffs + between the limited-functionality and full-functionality options. A + full discussion of the potential effects of an adversary's + modifications of the ECN field is given in Sections 18 and 19. + +9.1.1. The Limited-functionality and Full-functionality Options + + The limited-functionality option for ECN encapsulation in IP tunnels + is for the not-ECT codepoint to be set in the outside (encapsulating) + header regardless of the value of the ECN field in the inside + (encapsulated) header. With this option, the ECN field in the inner + header is not altered upon de-capsulation. The disadvantage of this + approach is that the flow does not have ECN support for that part of + the path that is using IP tunneling, even if the encapsulated packet + (from the original TCP sender) is ECN-Capable. That is, if the + encapsulated packet arrives at a congested router that is ECN- + capable, and the router can decide to drop or mark the packet as an + indication of congestion to the end nodes, the router will not be + permitted to set the CE codepoint in the packet header, but instead + will have to drop the packet. + + The full-functionality option for ECN encapsulation is to copy the + ECN codepoint of the inside header to the outside header on + encapsulation if the inside header is not-ECT or ECT, and to set the + ECN codepoint of the outside header to ECT(0) if the ECN codepoint of + the inside header is CE. 
On decapsulation, if the CE codepoint is + set on the outside header, then the CE codepoint is also set in the + inner header. Otherwise, the ECN codepoint on the inner header is + left unchanged. That is, for full ECN support the encapsulation and + decapsulation processing involves the following: At tunnel ingress, + the full-functionality option sets the ECN codepoint in the outer + header. If the ECN codepoint in the inner header is not-ECT or ECT, + then it is copied to the ECN codepoint in the outer header. If the + ECN codepoint in the inner header is CE, then the ECN codepoint in + the outer header is set to ECT(0). Upon decapsulation at the tunnel + egress, the full-functionality option sets the CE codepoint in the + inner header if the CE codepoint is set in the outer header. + Otherwise, no change is made to this field of the inner header. + + With the full-functionality option, a flow can take advantage of ECN + in those parts of the path that might use IP tunneling. The + disadvantage of the full-functionality option from a security + perspective is that the IP tunnel cannot protect the flow from + certain modifications to the ECN bits in the IP header within the + tunnel. The potential dangers from modifications to the ECN bits in + the IP header are described in detail in Sections 18 and 19. + + + + + +Ramakrishnan, et al. Standards Track [Page 27] + +RFC 3168 The Addition of ECN to IP September 2001 + + + (1) An IP tunnel MUST modify the handling of the DS field octet at + IP tunnel endpoints by implementing either the limited- + functionality or the full-functionality option. + + (2) Optionally, an IP tunnel MAY enable the endpoints of an IP + tunnel to negotiate the choice between the limited-functionality + and the full-functionality option for ECN in the tunnel. + + The minimum required to make ECN usable with IP tunnels is the + limited-functionality option, which prevents ECN from being enabled + in the outer header of the tunnel. 
Full support for ECN requires the + use of the full-functionality option. If there are no optional + mechanisms for the tunnel endpoints to negotiate a choice between the + limited-functionality or full-functionality option, there can be a + pre-existing agreement between the tunnel endpoints about whether to + support the limited-functionality or the full-functionality ECN + option. + + All IP tunnels MUST implement the limited-functionality option, and + SHOULD support the full-functionality option. + + In addition, it is RECOMMENDED that packets with the CE codepoint in + the outer header be dropped if they arrive at the tunnel egress point + for a tunnel that uses the limited-functionality option, or for a + tunnel that uses the full-functionality option but for which the + not-ECT codepoint is set in the inner header. This is motivated by + backwards compatibility and to ensure that no unauthorized + modifications of the ECN field take place, and is discussed further + in the next Section (9.1.2). + +9.1.2. Changes to the ECN Field within an IP Tunnel. + + The presence of a copy of the ECN field in the inner header of an IP + tunnel mode packet provides an opportunity for detection of + unauthorized modifications to the ECN field in the outer header. + Comparison of the ECT fields in the inner and outer headers falls + into two categories for implementations that conform to this + document: + + * If the IP tunnel uses the full-functionality option, then the + not-ECT codepoint should be set in the outer header if and only + if it is also set in the inner header. + + * If the tunnel uses the limited-functionality option, then the + not-ECT codepoint should be set in the outer header. + + Receipt of a packet not satisfying the appropriate condition could be + a cause of concern. + + + +Ramakrishnan, et al. 
Standards Track [Page 28] + +RFC 3168 The Addition of ECN to IP September 2001 + + + Consider the case of an IP tunnel where the tunnel ingress point has + not been updated to this document's requirements, while the tunnel + egress point has been updated to support ECN. In this case, the IP + tunnel is not explicitly configured to support the full-functionality + ECN option. However, the tunnel ingress point is behaving identically + to a tunnel ingress point that supports the full-functionality + option. If packets from an ECN-capable connection use this tunnel, + the ECT codepoint will be set in the outer header at the tunnel + ingress point. Congestion within the tunnel may then result in ECN- + capable routers setting CE in the outer header. Because the tunnel + has not been explicitly configured to support the full-functionality + option, the tunnel egress point expects the not-ECT codepoint to be + set in the outer header. When an ECN-capable tunnel egress point + receives a packet with the ECT or CE codepoint in the outer header, + in a tunnel that has not been configured to support the full- + functionality option, that packet should be processed, according to + whether the CE codepoint was set, as follows. It is RECOMMENDED that + on a tunnel that has not been configured to support the full- + functionality option, packets should be dropped at the egress point + if the CE codepoint is set in the outer header but not in the inner + header, and should be forwarded otherwise. + + An IP tunnel cannot provide protection against erasure of congestion + indications based on changing the ECN codepoint from CE to ECT. The + erasure of congestion indications may impact the network and other + flows in ways that would not be possible in the absence of ECN. 
It + is important to note that erasure of congestion indications can only + be performed to congestion indications placed by nodes within the + tunnel; the copy of the ECN field in the inner header preserves + congestion notifications from nodes upstream of the tunnel ingress + (unless the inner header is also erased). If erasure of congestion + notifications is judged to be a security risk that exceeds the + congestion management benefits of ECN, then tunnels could be + specified or configured to use the limited-functionality option. + +9.2. IPsec Tunnels + + IPsec supports secure communication over potentially insecure network + components such as intermediate routers. IPsec protocols support two + operating modes, transport mode and tunnel mode, that span a wide + range of security requirements and operating environments. Transport + mode security protocol header(s) are inserted between the IP (IPv4 or + IPv6) header and higher layer protocol headers (e.g., TCP), and hence + transport mode can only be used for end-to-end security on a + connection. IPsec tunnel mode is based on adding a new "outer" IP + header that encapsulates the original, or "inner" IP header and its + associated packet. Tunnel mode security headers are inserted between + these two IP headers. In contrast to transport mode, the new "outer" + + + +Ramakrishnan, et al. Standards Track [Page 29] + +RFC 3168 The Addition of ECN to IP September 2001 + + + IP header and tunnel mode security headers can be added and removed + at intermediate points along a connection, enabling security gateways + to secure vulnerable portions of a connection without requiring + endpoint participation in the security protocols. An important + aspect of tunnel mode security is that in the original specification, + the outer header is discarded at tunnel egress, ensuring that + security threats based on modifying the IP header do not propagate + beyond that tunnel endpoint. 
Further discussion of IPsec can be + found in [RFC2401]. + + The IPsec protocol as originally defined in [ESP, AH] required that + the inner header's ECN field not be changed by IPsec decapsulation + processing at a tunnel egress node; this would have ruled out the + possibility of full-functionality mode for ECN. At the same time, + this would ensure that an adversary's modifications to the ECN field + cannot be used to launch theft- or denial-of-service attacks across + an IPsec tunnel endpoint, as any such modifications will be discarded + at the tunnel endpoint. + + In principle, permitting the use of ECN functionality in the outer + header of an IPsec tunnel raises security concerns because an + adversary could tamper with the information that propagates beyond + the tunnel endpoint. Based on an analysis (included in Sections 18 + and 19) of these concerns and the associated risks, our overall + approach has been to provide configuration support for IPsec changes + to remove the conflict with ECN. + + In particular, in tunnel mode the IPsec tunnel MUST support the + limited-functionality option outlined in Section 9.1.1, and SHOULD + support the full-functionality option outlined in Section 9.1.1. + + This makes permission to use ECN functionality in the outer header of + an IPsec tunnel a configurable part of the corresponding IPsec + Security Association (SA), so that it can be disabled in situations + where the risks are judged to outweigh the benefits. The result is + that an IPsec security administrator is presented with two + alternatives for the behavior of ECN-capable connections within an + IPsec tunnel, the limited-functionality alternative and full- + functionality alternative described earlier. + + In addition, this document specifies how the endpoints of an IPsec + tunnel could negotiate enabling ECN functionality in the outer + headers of that tunnel based on security policy. 
The ability to + negotiate ECN usage between tunnel endpoints would enable a security + administrator to disable ECN in situations where she believes the + risks (e.g., of lost congestion notifications) outweigh the benefits + of ECN. + + + + +Ramakrishnan, et al. Standards Track [Page 30] + +RFC 3168 The Addition of ECN to IP September 2001 + + + The IPsec protocol, as defined in [ESP, AH], does not include the IP + header's ECN field in any of its cryptographic calculations (in the + case of tunnel mode, the outer IP header's ECN field is not + included). Hence modification of the ECN field by a network node has + no effect on IPsec's end-to-end security, because it cannot cause any + IPsec integrity check to fail. As a consequence, IPsec does not + provide any defense against an adversary's modification of the ECN + field (i.e., a man-in-the-middle attack), as the adversary's + modification will also have no effect on IPsec's end-to-end security. + In some environments, the ability to modify the ECN field without + affecting IPsec integrity checks may constitute a covert channel; if + it is necessary to eliminate such a channel or reduce its bandwidth, + then the IPsec tunnel should be run in limited-functionality mode. + +9.2.1. Negotiation between Tunnel Endpoints + + This section describes the detailed changes to enable usage of ECN + over IPsec tunnels, including the negotiation of ECN support between + tunnel endpoints. This is supported by three changes to IPsec: + + * An optional Security Association Database (SAD) field indicating + whether tunnel encapsulation and decapsulation processing allows + or forbids ECN usage in the outer IP header. + + * An optional Security Association Attribute that enables + negotiation of this SAD field between the two endpoints of an SA + that supports tunnel mode. 
+ + * Changes to tunnel mode encapsulation and decapsulation + processing to allow or forbid ECN usage in the outer IP header + based on the value of the SAD field. When ECN usage is allowed + in the outer IP header, the ECT codepoint is set in the outer + header for ECN-capable connections and congestion notifications + (indicated by the CE codepoint) from such connections are + propagated to the inner header at tunnel egress. + + If negotiation of ECN usage is implemented, then the SAD field SHOULD + also be implemented. On the other hand, negotiation of ECN usage is + OPTIONAL in all cases, even for implementations that support the SAD + field. The encapsulation and decapsulation processing changes are + REQUIRED, but MAY be implemented without the other two changes by + assuming that ECN usage is always forbidden. The full-functionality + alternative for ECN usage over IPsec tunnels consists of the SAD + field and the full version of encapsulation and decapsulation + processing changes, with or without the OPTIONAL negotiation support. + The limited-functionality alternative consists of a subset of the + encapsulation and decapsulation changes that always forbids ECN + usage. + + + +Ramakrishnan, et al. Standards Track [Page 31] + +RFC 3168 The Addition of ECN to IP September 2001 + + + These changes are covered further in the following three subsections. + +9.2.1.1. ECN Tunnel Security Association Database Field + + Full ECN functionality adds a new field to the SAD (see [RFC2401]): + + ECN Tunnel: allowed or forbidden. + + Indicates whether ECN-capable connections using this SA in tunnel + mode are permitted to receive ECN congestion notifications for + congestion occurring within the tunnel. The allowed value enables + ECN congestion notifications. The forbidden value disables such + notifications, causing all congestion to be indicated via dropped + packets. + + [OPTIONAL. 
The value of this field SHOULD be assumed to be + "forbidden" in implementations that do not support it.] + + If this attribute is implemented, then the SA specification in a + Security Policy Database (SPD) entry MUST support a corresponding + attribute, and this SPD attribute MUST be covered by the SPD + administrative interface (currently described in Section 4.4.1 of + [RFC2401]). + +9.2.1.2. ECN Tunnel Security Association Attribute + + A new IPsec Security Association Attribute is defined to enable the + support for ECN congestion notifications based on the outer IP header + to be negotiated for IPsec tunnels (see [RFC2407]). This attribute + is OPTIONAL, although implementations that support it SHOULD also + support the SAD field defined in Section 9.2.1.1. + + Attribute Type + + class value type + ------------------------------------------------- + ECN Tunnel 10 Basic + + The IPsec SA Attribute value 10 has been allocated by IANA to + indicate that the ECN Tunnel SA Attribute is being negotiated; the + type of this attribute is Basic (see Section 4.5 of [RFC2407]). The + Class Values are used to conduct the negotiation. See [RFC2407, + RFC2408, RFC2409] for further information including encoding formats + and requirements for negotiating this SA attribute. + + + + + + + +Ramakrishnan, et al. Standards Track [Page 32] + +RFC 3168 The Addition of ECN to IP September 2001 + + + Class Values + + ECN Tunnel + + Specifies whether ECN functionality is allowed to be used with Tunnel + Encapsulation Mode. This affects tunnel encapsulation and + decapsulation processing - see Section 9.2.1.3. + + RESERVED 0 + Allowed 1 + Forbidden 2 + + Values 3-61439 are reserved to IANA. Values 61440-65535 are for + private use. + + If unspecified, the default shall be assumed to be Forbidden. + + ECN Tunnel is a new SA attribute, and hence initiators that use it + can expect to encounter responders that do not understand it, and + therefore reject proposals containing it. 
For backwards + compatibility with such implementations initiators SHOULD always also + include a proposal without the ECN Tunnel attribute to enable such a + responder to select a transform or proposal that does not contain the + ECN Tunnel attribute. RFC 2407 currently requires responders to + reject all proposals if any proposal contains an unknown attribute; + this requirement is expected to be changed to require a responder not + to select proposals or transforms containing unknown attributes. + +9.2.1.3. Changes to IPsec Tunnel Header Processing + + For full ECN support, the encapsulation and decapsulation processing + for the IPv4 TOS field and the IPv6 Traffic Class field are changed + from that specified in [RFC2401] to the following: + + <-- How Outer Hdr Relates to Inner Hdr --> + Outer Hdr at Inner Hdr at + IPv4 Encapsulator Decapsulator + Header fields: -------------------- ------------ + DS Field copied from inner hdr (5) no change + ECN Field constructed (7) constructed (8) + + IPv6 + Header fields: + DS Field copied from inner hdr (6) no change + ECN Field constructed (7) constructed (8) + + + + + + +Ramakrishnan, et al. Standards Track [Page 33] + +RFC 3168 The Addition of ECN to IP September 2001 + + + (5)(6) If the packet will immediately enter a domain for which the + DSCP value in the outer header is not appropriate, that value MUST + be mapped to an appropriate value for the domain [RFC 2474]. Also + see [RFC 2475] for further information. + + (7) If the value of the ECN Tunnel field in the SAD entry for this + SA is "allowed" and the ECN field in the inner header is set to + any value other than CE, copy this ECN field to the outer header. + If the ECN field in the inner header is set to CE, then set the + ECN field in the outer header to ECT(0). 
+ + (8) If the value of the ECN Tunnel field in the SAD entry for this + SA is "allowed" and the ECN field in the inner header is set to + ECT(0) or ECT(1) and the ECN field in the outer header is set to + CE, then copy the ECN field from the outer header to the inner + header. Otherwise, make no change to the ECN field in the inner + header. + + (5) and (6) are identical to match usage in [RFC2401], although + they are different in [RFC2401]. + + The above description applies to implementations that support the ECN + Tunnel field in the SAD; such implementations MUST implement this + processing instead of the processing of the IPv4 TOS octet and IPv6 + Traffic Class octet defined in [RFC2401]. This constitutes the + full-functionality alternative for ECN usage with IPsec tunnels. + + An implementation that does not support the ECN Tunnel field in the + SAD MUST implement this processing by assuming that the value of the + ECN Tunnel field of the SAD is "forbidden" for every SA. In this + case, the processing of the ECN field reduces to: + + (7) Set the ECN field to not-ECT in the outer header. + (8) Make no change to the ECN field in the inner header. + + This constitutes the limited-functionality alternative for ECN usage + with IPsec tunnels. + + For backwards compatibility, packets with the CE codepoint set in the + outer header SHOULD be dropped if they arrive on an SA that is using + the limited-functionality option, or that is using the full- + functionality option with the not-ECT codepoint set in the inner + header. + + + + + + + + +Ramakrishnan, et al. Standards Track [Page 34] + +RFC 3168 The Addition of ECN to IP September 2001 + + +9.2.2. Changes to the ECN Field within an IPsec Tunnel. + + If the ECN Field is changed inappropriately within an IPsec tunnel, + and this change is detected at the tunnel egress, then the receipt of + a packet not satisfying the appropriate condition for its SA is an + auditable event.
An implementation MAY create audit records with + per-SA counts of incorrect packets over some time period rather than + creating an audit record for each erroneous packet. Any such audit + record SHOULD contain the headers from at least one erroneous packet, + but need not contain the headers from every packet represented by the + entry. + +9.2.3. Comments for IPsec Support + + Substantial comments were received on two areas of this document + during review by the IPsec working group. This section describes + these comments and explains why the proposed changes were not + incorporated. + + The first comment indicated that per-node configuration is easier to + implement than per-SA configuration. After serious thought and + despite some initial encouragement of per-node configuration, it no + longer seems to be a good idea. The concern is that as ECN-awareness + is progressively deployed in IPsec, many ECN-aware IPsec + implementations will find themselves communicating with a mixture of + ECN-aware and ECN-unaware IPsec tunnel endpoints. In such an + environment with per-node configuration, the only reasonable thing to + do is forbid ECN usage for all IPsec tunnels, which is not the + desired outcome. + + In the second area, several reviewers noted that SA negotiation is + complex, and adding to it is non-trivial. One reviewer suggested + using ICMP after tunnel setup as a possible alternative. The + addition to SA negotiation in this document is OPTIONAL and will + remain so; implementers are free to ignore it. The authors believe + that the assurance it provides can be useful in a number of + situations. In practice, if this is not implemented, it can be + deleted at a subsequent stage in the standards process. Extending + ICMP to negotiate ECN after tunnel setup is more complex than + extending SA attribute negotiation. 
Some tunnels do not permit + traffic to be addressed to the tunnel egress endpoint, hence the ICMP + packet would have to be addressed to somewhere else, scanned for by + the egress endpoint, and discarded there or at its actual + destination. In addition, ICMP delivery is unreliable, and hence + there is a possibility of an ICMP packet being dropped, entailing the + invention of yet another ack/retransmit mechanism. It seems better + simply to specify an OPTIONAL extension to the existing SA + negotiation mechanism. + + + +Ramakrishnan, et al. Standards Track [Page 35] + +RFC 3168 The Addition of ECN to IP September 2001 + + +9.3. IP packets encapsulated in non-IP Packet Headers. + + A different set of issues is raised, relative to ECN, when IP + packets are encapsulated in tunnels with non-IP packet headers. This + occurs with MPLS [MPLS], GRE [GRE], L2TP [L2TP], and PPTP [PPTP]. + For these protocols, there is no conflict with ECN; it is just that + ECN cannot be used within the tunnel unless an ECN codepoint can be + specified for the header of the encapsulating protocol. Earlier work + considered a preliminary proposal for incorporating ECN into MPLS, + and proposals for incorporating ECN into GRE, L2TP, or PPTP will be + considered as the need arises. + +10. Issues Raised by Monitoring and Policing Devices + + One possibility is that monitoring and policing devices (or more + informally, "penalty boxes") will be installed in the network to + monitor whether best-effort flows are appropriately responding to + congestion, and to preferentially drop packets from flows determined + not to be using adequate end-to-end congestion control procedures. + + We recommend that any "penalty box" that detects a flow or an + aggregate of flows that is not responding to end-to-end congestion + control first change from marking to dropping packets from that flow, + before taking any additional action to restrict the bandwidth + available to that flow.
Thus, initially, the router may drop packets + in which the router would otherwise have set the CE codepoint. + This could include dropping those arriving packets for that flow that + are ECN-Capable and that already have the CE codepoint set. In this + way, any congestion indications seen by that router for that flow + will be guaranteed to also be seen by the end nodes, even in the + presence of malicious or broken routers elsewhere in the path. If we + assume that the first action taken at any "penalty box" for an ECN- + capable flow will be to drop packets instead of marking them, then + there is no way that an adversary that subverts ECN-based end-to-end + congestion control can cause a flow to be characterized as being + non-cooperative and placed into a more severe action within the + "penalty box". + + The monitoring and policing devices that are actually deployed could + fall short of the `ideal' monitoring device described above, in that + the monitoring is applied not to a single flow, but to an aggregate + of flows (e.g., those sharing a single IPsec tunnel). In this case, + the switch from marking to dropping would apply to all of the flows + in that aggregate, denying the benefits of ECN to the other flows in + the aggregate also. At the highest level of aggregation, another + form of the disabling of ECN happens even in the absence of + + + + + +Ramakrishnan, et al. Standards Track [Page 36] + +RFC 3168 The Addition of ECN to IP September 2001 + + + monitoring and policing devices, when ECN-Capable RED queues switch + from marking to dropping packets as an indication of congestion when + the average queue size has exceeded some threshold. + +11. Evaluations of ECN + +11.1. Related Work Evaluating ECN + + This section discusses some of the related work evaluating the use of + ECN. The ECN Web Page [ECN] has pointers to other papers, as well as + to implementations of ECN.
+ + [Floyd94] considers the advantages and drawbacks of adding ECN to the + TCP/IP architecture. As shown in the simulation-based comparisons, + one advantage of ECN is to avoid unnecessary packet drops for short + or delay-sensitive TCP connections. A second advantage of ECN is in + avoiding some unnecessary retransmit timeouts in TCP. This paper + discusses in detail the integration of ECN into TCP's congestion + control mechanisms. The possible disadvantages of ECN discussed in + the paper are that a non-compliant TCP connection could falsely + advertise itself as ECN-capable, and that a TCP ACK packet carrying + an ECN-Echo message could itself be dropped in the network. The + first of these two issues is discussed in the appendix of this + document, and the second is addressed by the addition of the CWR flag + in the TCP header. + + Experimental evaluations of ECN include [RFC2884,K98]. The + conclusions of [K98] and [RFC2884] are that ECN TCP gets moderately + better throughput than non-ECN TCP; that ECN TCP flows are fair + towards non-ECN TCP flows; and that ECN TCP is robust with two-way + traffic (with congestion in both directions) and with multiple + congested gateways. Experiments with many short web transfers show + that, while most of the short connections have similar transfer times + with or without ECN, a small percentage of the short connections have + very long transfer times for the non-ECN experiments as compared to + the ECN experiments. + +11.2. A Discussion of the ECN nonce. + + The use of two ECT codepoints, ECT(0) and ECT(1), can provide a one- + bit ECN nonce in packet headers [SCWA99]. The primary motivation for + this is the desire to allow mechanisms for the data sender to verify + that network elements are not erasing the CE codepoint, and that data + receivers are properly reporting to the sender the receipt of packets + with the CE codepoint set, as required by the transport protocol. 
+ This section discusses issues of backwards compatibility with IP ECN + implementations in routers conformant with RFC 2481, in which only + one ECT codepoint was defined. We do not believe that the + + + +Ramakrishnan, et al. Standards Track [Page 37] + +RFC 3168 The Addition of ECN to IP September 2001 + + + incremental deployment of ECN implementations that understand the + ECT(1) codepoint will cause significant operational problems. This + is particularly likely to be the case when the deployment of the + ECT(1) codepoint begins with routers, before the ECT(1) codepoint + starts to be used by end-nodes. + +11.2.1. The Incremental Deployment of ECT(1) in Routers. + + ECN has been an Experimental standard since January 1999, and there + are already implementations of ECN in routers that do not understand + the ECT(1) codepoint. When the use of the ECT(1) codepoint is + standardized for TCP or for other transport protocols, this could + mean that a data sender is using the ECT(1) codepoint, but that this + codepoint is not understood by a congested router on the path. + + If allowed by the transport protocol, a data sender would be free not + to make use of ECT(1) at all, and to send all ECN-capable packets + with the codepoint ECT(0). However, if an ECN-capable sender is + using ECT(1), and the congested router on the path did not understand + the ECT(1) codepoint, then the router would end up marking some of + the ECT(0) packets, and dropping some of the ECT(1) packets, as + indications of congestion. Since TCP is required to react to both + marked and dropped packets, this behavior of dropping packets that + could have been marked poses no significant threat to the network, + and is consistent with the overall approach to ECN that allows + routers to determine when and whether to mark packets as they see fit + (see Section 5). + +12. Summary of changes required in IP and TCP + + This document specified two bits in the IP header to be used for ECN. 
+ The not-ECT codepoint indicates that the transport protocol will + ignore the CE codepoint. This is the default value for the ECN + codepoint. The ECT codepoints indicate that the transport protocol + is willing and able to participate in ECN. + + The router sets the CE codepoint to indicate congestion to the end + nodes. The CE codepoint in a packet header MUST NOT be reset by a + router. + + TCP requires three changes for ECN, a setup phase and two new flags + in the TCP header. The ECN-Echo flag is used by the data receiver to + inform the data sender of a received CE packet. The Congestion + Window Reduced (CWR) flag is used by the data sender to inform the + data receiver that the congestion window has been reduced. + + + + + + +Ramakrishnan, et al. Standards Track [Page 38] + +RFC 3168 The Addition of ECN to IP September 2001 + + + When ECN (Explicit Congestion Notification) is used, it is required + that congestion indications generated within an IP tunnel not be lost + at the tunnel egress. We specified a minor modification to the IP + protocol's handling of the ECN field during encapsulation and de- + capsulation to allow flows that will undergo IP tunneling to use ECN. + + Two options for ECN in tunnels were specified: + + 1) A limited-functionality option that does not use ECN inside the IP + tunnel, by setting the ECN field in the outer header to not-ECT, and + not altering the inner header at the time of decapsulation. + + 2) The full-functionality option, which sets the ECN field in the + outer header to either not-ECT or to one of the ECT codepoints, + depending on the ECN field in the inner header. At decapsulation, if + the CE codepoint is set in the outer header, and the inner header is + set to one of the ECT codepoints, then the CE codepoint is copied to + the inner header. 
+ + For IPsec tunnels, this document also defines an optional IPsec + Security Association (SA) attribute that enables negotiation of ECN + usage within IPsec tunnels and an optional field in the Security + Association Database to indicate whether ECN is permitted in tunnel + mode on a SA. The required changes to IPsec tunnels for ECN usage + modify RFC 2401 [RFC2401], which defines the IPsec architecture and + specifies some aspects of its implementation. The new IPsec SA + attribute is in addition to those already defined in Section 4.5 of + [RFC2407]. + + This document obsoletes RFC 2481, "A Proposal to add Explicit + Congestion Notification (ECN) to IP", which defined ECN as an + Experimental Protocol for the Internet Community. The rest of this + section describes the relationship between this document and its + predecessor. + + RFC 2481 included a brief discussion of the use of ECN with + encapsulated packets, and noted that for the IPsec specifications at + the time (January 1999), flows could not safely use ECN if they were + to traverse IPsec tunnels. RFC 2481 also described the changes that + could be made to IPsec tunnel specifications to make them compatible + with ECN. + + This document also incorporates work that was done after RFC 2481. + First was to describe the changes to IPsec tunnels in detail, and + extensively discuss the security implications of ECN (now included as + Sections 18 and 19 of this document). Second was to extend the + discussion of IPsec tunnels to include all IP tunnels. Because older + IP tunnels are not compatible with a flow's use of ECN, the + + + +Ramakrishnan, et al. Standards Track [Page 39] + +RFC 3168 The Addition of ECN to IP September 2001 + + + deployment of ECN in the Internet will create strong pressure for + older IP tunnels to be updated to an ECN-compatible version, using + either the limited-functionality or the full-functionality option.
+ + This document does not address the issue of including ECN in non-IP + tunnels such as MPLS, GRE, L2TP, or PPTP. An earlier preliminary + document about adding ECN support to MPLS was not advanced. + + A third new piece of work after RFC2481 was to describe the ECN + procedure with retransmitted data packets, that an ECT codepoint + should not be set on retransmitted data packets. The motivation for + this additional specification is to eliminate a possible avenue for + denial-of-service attacks on an existing TCP connection. Some prior + deployments of ECN-capable TCP might not conform to the (new) + requirement not to set an ECT codepoint on retransmitted packets; we + do not believe this will cause significant problems in practice. + + This document also expands slightly on the specification of the use + of SYN packets for the negotiation of ECN. While some prior + deployments of ECN-capable TCP might not conform to the requirements + specified in this document, we do not believe that this will lead to + any performance or compatibility problems for TCP connections with a + combination of TCP implementations at the endpoints. + + This document also includes the specification of the ECT(1) + codepoint, which may be used by TCP as part of the implementation of + an ECN nonce. + +13. Conclusions + + Given the current effort to implement AQM, we believe this is the + right time to deploy congestion avoidance mechanisms that do not + depend on packet drops alone. With the increased deployment of + applications and transports sensitive to the delay and loss of a + single packet (e.g., realtime traffic, short web transfers), + depending on packet loss as a normal congestion notification + mechanism appears to be insufficient (or at the very least, non- + optimal). + + We examined the consequence of modifications of the ECN field within + the network, analyzing all the opportunities for an adversary to + change the ECN field. 
In many cases, the change to the ECN field is + no worse than dropping a packet. However, we noted that some changes + have the more serious consequence of subverting end-to-end congestion + control. However, we point out that even then the potential damage + is limited, and is similar to the threat posed by end-systems + intentionally failing to cooperate with end-to-end congestion + control. + + + +Ramakrishnan, et al. Standards Track [Page 40] + +RFC 3168 The Addition of ECN to IP September 2001 + + +14. Acknowledgements + + Many people have made contributions to this work and this document, + including many that we have not managed to directly acknowledge in + this document. In addition, we would like to thank Kenjiro Cho for + the proposal for the TCP mechanism for negotiating ECN-Capability, + Kevin Fall for the proposal of the CWR bit, Steve Blake for material + on IPv4 Header Checksum Recalculation, Jamal Hadi-Salim for + discussions of ECN issues, and Steve Bellovin, Jim Bound, Brian + Carpenter, Paul Ferguson, Stephen Kent, Greg Minshall, and Vern + Paxson for discussions of security issues. We also thank the + Internet End-to-End Research Group for ongoing discussions of these + issues. + + Email discussions with a number of people, including Dax Kelson, + Alexey Kuznetsov, Jamal Hadi-Salim, and Venkat Venkatsubra, have + addressed the issues raised by non-conformant equipment in the + Internet that does not respond to TCP SYN packets with the ECE and + CWR flags set. We thank Mark Handley, Jitendra Padhye, and others + for discussions on the TCP initialization procedures. + + The discussion of ECN and IP tunnel considerations draws heavily on + related discussions and documents from the Differentiated Services + Working Group. We thank Tabassum Bint Haque from Dhaka, Bangladesh, + for feedback on IP tunnels.
We thank Derrell Piper and Kero Tivinen + for proposing modifications to RFC 2407 that improve the usability of + negotiating the ECN Tunnel SA attribute. + + We thank David Wetherall, David Ely, and Neil Spring for the proposal + for the ECN nonce. We also thank Stefan Savage for discussions on + this issue. We thank Bob Briscoe and Jon Crowcroft for raising the + issue of fragmentation in IP, on alternate semantics for the fourth + ECN codepoint, and several other topics. We thank Richard Wendland + for feedback on several issues in the document. + + We also thank the IESG, and in particular the Transport Area + Directors over the years, for their feedback and their work towards + the standardization of ECN. + +15. References + + [AH] Kent, S. and R. Atkinson, "IP Authentication Header", + RFC 2402, November 1998. + + [ECN] "The ECN Web Page", URL + "http://www.aciri.org/floyd/ecn.html". Reference for + informational purposes only. + + + + +Ramakrishnan, et al. Standards Track [Page 41] + +RFC 3168 The Addition of ECN to IP September 2001 + + + [ESP] Kent, S. and R. Atkinson, "IP Encapsulating Security + Payload", RFC 2406, November 1998. + + [FIXES] ECN-under-Linux Unofficial Vendor Support Page, URL + "http://gtf.org/garzik/ecn/". Reference for + informational purposes only. + + [FJ93] Floyd, S., and Jacobson, V., "Random Early Detection + gateways for Congestion Avoidance", IEEE/ACM + Transactions on Networking, V.1 N.4, August 1993, p. + 397-413. + + [Floyd94] Floyd, S., "TCP and Explicit Congestion Notification", + ACM Computer Communication Review, V. 24 N. 5, October + 1994, p. 10-23. + + [Floyd98] Floyd, S., "The ECN Validation Test in the NS + Simulator", URL "http://www-mash.cs.berkeley.edu/ns/", + test tcl/test/test-all- ecn. Reference for + informational purposes only. + + [FF99] Floyd, S., and Fall, K., "Promoting the Use of End-to- + End Congestion Control in the Internet", IEEE/ACM + Transactions on Networking, August 1999. 
+ + [FRED] Lin, D., and Morris, R., "Dynamics of Random Early + Detection", SIGCOMM '97, September 1997. + + [GRE] Hanks, S., Li, T., Farinacci, D. and P. Traina, "Generic + Routing Encapsulation (GRE)", RFC 1701, October 1994. + + [Jacobson88] V. Jacobson, "Congestion Avoidance and Control", Proc. + ACM SIGCOMM '88, pp. 314-329. + + [Jacobson90] V. Jacobson, "Modified TCP Congestion Avoidance + Algorithm", Message to end2end-interest mailing list, + April 1990. URL + "ftp://ftp.ee.lbl.gov/email/vanj.90apr30.txt". + + [K98] Krishnan, H., "Analyzing Explicit Congestion + Notification (ECN) benefits for TCP", Master's thesis, + UCLA, 1998. Citation for acknowledgement purposes only. + + [L2TP] Townsley, W., Valencia, A., Rubens, A., Pall, G., Zorn, + G. and B. Palter, "Layer Two Tunneling Protocol "L2TP"", + RFC 2661, August 1999. + + + + + +Ramakrishnan, et al. Standards Track [Page 42] + +RFC 3168 The Addition of ECN to IP September 2001 + + + [MJV96] S. McCanne, V. Jacobson, and M. Vetterli, "Receiver- + driven Layered Multicast", SIGCOMM '96, August 1996, pp. + 117-130. + + [MPLS] Awduche, D., Malcolm, J., Agogbua, J., O'Dell, M. and J. + McManus, Requirements for Traffic Engineering Over MPLS, + RFC 2702, September 1999. + + [PPTP] Hamzeh, K., Pall, G., Verthein, W., Taarud, J., Little, + W. and G. Zorn, "Point-to-Point Tunneling Protocol + (PPTP)", RFC 2637, July 1999. + + [RFC791] Postel, J., "Internet Protocol", STD 5, RFC 791, + September 1981. + + [RFC793] Postel, J., "Transmission Control Protocol", STD 7, RFC + 793, September 1981. + + [RFC1141] Mallory, T. and A. Kullberg, "Incremental Updating of + the Internet Checksum", RFC 1141, January 1990. + + [RFC1349] Almquist, P., "Type of Service in the Internet Protocol + Suite", RFC 1349, July 1992. + + [RFC1455] Eastlake, D., "Physical Link Security Type of Service", + RFC 1455, May 1993. + + [RFC1701] Hanks, S., Li, T., Farinacci, D. and P. 
Traina, "Generic + Routing Encapsulation (GRE)", RFC 1701, October 1994. + + [RFC1702] Hanks, S., Li, T., Farinacci, D. and P. Traina, "Generic + Routing Encapsulation over IPv4 networks", RFC 1702, + October 1994. + + [RFC2003] Perkins, C., "IP Encapsulation within IP", RFC 2003, + October 1996. + + [RFC2119] Bradner, S., "Key words for use in RFCs to Indicate + Requirement Levels", BCP 14, RFC 2119, March 1997. + + [RFC2309] Braden, B., et al., "Recommendations on Queue Management + and Congestion Avoidance in the Internet", RFC 2309, + April 1998. + + [RFC2401] Kent, S. and R. Atkinson, Security Architecture for the + Internet Protocol, RFC 2401, November 1998. + + + + + +Ramakrishnan, et al. Standards Track [Page 43] + +RFC 3168 The Addition of ECN to IP September 2001 + + + [RFC2407] Piper, D., "The Internet IP Security Domain of + Interpretation for ISAKMP", RFC 2407, November 1998. + + [RFC2408] Maughan, D., Schertler, M., Schneider, M. and J. Turner, + "Internet Security Association and Key Management + Protocol (ISAKMP)", RFC 2408, November 1998. + + [RFC2409] Harkins D. and D. Carrel, "The Internet Key Exchange + (IKE)", RFC 2409, November 1998. + + [RFC2474] Nichols, K., Blake, S., Baker, F. and D. Black, + "Definition of the Differentiated Services Field (DS + Field) in the IPv4 and IPv6 Headers", RFC 2474, December + 1998. + + [RFC2475] Blake, S., Black, D., Carlson, M., Davies, E., Wang, Z. + and W. Weiss, "An Architecture for Differentiated + Services", RFC 2475, December 1998. + + [RFC2481] Ramakrishnan K. and S. Floyd, "A Proposal to add + Explicit Congestion Notification (ECN) to IP", RFC 2481, + January 1999. + + [RFC2581] Allman, M., Paxson, V. and W. Stevens, "TCP Congestion + Control", RFC 2581, April 1999. + + [RFC2884] Hadi Salim, J. and U. Ahmed, "Performance Evaluation of + Explicit Congestion Notification (ECN) in IP Networks", + RFC 2884, July 2000. + + [RFC2983] Black, D., "Differentiated Services and Tunnels", + RFC 2983, October 2000.
+ + [RFC2780] Bradner S. and V. Paxson, "IANA Allocation Guidelines + For Values In the Internet Protocol and Related + Headers", BCP 37, RFC 2780, March 2000. + + [RJ90] K. K. Ramakrishnan and Raj Jain, "A Binary Feedback + Scheme for Congestion Avoidance in Computer Networks", + ACM Transactions on Computer Systems, Vol.8, No.2, pp. + 158-181, May 1990. + + [SCWA99] Stefan Savage, Neal Cardwell, David Wetherall, and Tom + Anderson, TCP Congestion Control with a Misbehaving + Receiver, ACM Computer Communications Review, October + 1999. + + + + + +Ramakrishnan, et al. Standards Track [Page 44] + +RFC 3168 The Addition of ECN to IP September 2001 + + + [TBIT] Jitendra Padhye and Sally Floyd, "Identifying the TCP + Behavior of Web Servers", ICSI TR-01-002, February 2001. + URL "http://www.aciri.org/tbit/". + +16. Security Considerations + + Security considerations have been discussed in Sections 7, 8, 18, and + 19. + +17. IPv4 Header Checksum Recalculation + + IPv4 header checksum recalculation is an issue with some high-end + router architectures using an output-buffered switch, since most if + not all of the header manipulation is performed on the input side of + the switch, while the ECN decision would need to be made local to the + output buffer. This is not an issue for IPv6, since there is no IPv6 + header checksum. The IPv4 TOS octet is the last byte of a 16-bit + half-word. + + RFC 1141 [RFC1141] discusses the incremental updating of the IPv4 + checksum after the TTL field is decremented. The incremental + updating of the IPv4 checksum after the CE codepoint was set would + work as follows: Let HC be the original header checksum for an ECT(0) + packet, and let HC' be the new header checksum after the CE bit has + been set. That is, the ECN field has changed from '10' to '11'. 
+ Then for header checksums calculated with one's complement + subtraction, HC' would be recalculated as follows: + + $$HC' = \begin{cases} HC - 1, & HC > 1 \\ + \mathtt{0x0000}, & HC = 1 \end{cases}$$ + + For header checksums calculated on two's complement machines, HC' + would be recalculated as follows after the CE bit was set: + + $$HC' = \begin{cases} HC - 1, & HC > 0 \\ + \mathtt{0xFFFE}, & HC = 0 \end{cases}$$ + + A similar incremental updating of the IPv4 checksum can be carried + out when the ECN field is changed from ECT(1) to CE, that is, from + '01' to '11'. + +18. Possible Changes to the ECN Field in the Network + + This section discusses in detail possible changes to the ECN field in + the network, such as falsely reporting congestion, disabling ECN- + Capability for an individual packet, erasing the ECN congestion + indication, or falsely indicating ECN-Capability. + + + + +Ramakrishnan, et al. Standards Track [Page 45] + +RFC 3168 The Addition of ECN to IP September 2001 + + +18.1. Possible Changes to the IP Header + +18.1.1. Erasing the Congestion Indication + + First, we consider the changes that a router could make that would + result in effectively erasing the congestion indication after it had + been set by a router upstream. The convention followed is: ECN + codepoint of received packet -> ECN codepoint of packet transmitted. + + Replacing the CE codepoint with the ECT(0) or ECT(1) codepoint + effectively erases the congestion indication. However, with the use + of two ECT codepoints, a router erasing the CE codepoint has no way + to know whether the original ECT codepoint was ECT(0) or ECT(1). + Thus, it is possible for the transport protocol to deploy mechanisms + to detect such erasures of the CE codepoint. + + The consequence of the erasure of the CE codepoint for the upstream + router is that there is a potential for congestion to build for a + time, because the congestion indication does not reach the source. + However, the packet would be received and acknowledged.
+ + The potential effect of erasing the congestion indication is complex, + and is discussed in depth in Section 19 below. Note that the effect + of erasing the congestion indication is different from dropping a + packet in the network. When a data packet is dropped, the drop is + detected by the TCP sender, and interpreted as an indication of + congestion. Similarly, if a sufficient number of consecutive + acknowledgement packets are dropped, causing the cumulative + acknowledgement field not to be advanced at the sender, the sender is + limited by the congestion window from sending additional packets, and + ultimately the retransmit timer expires. + + In contrast, a systematic erasure of the CE bit by a downstream + router can have the effect of causing a queue buildup at an upstream + router, including the possible loss of packets due to buffer + overflow. There is a potential of unfairness in that another flow + that goes through the congested router could react to the CE bit set + while the flow that has the CE bit erased could see better + performance. The limitations on this potential unfairness are + discussed in more detail in Section 19 below. + + The last of the three changes is to replace the CE codepoint with the + not-ECT codepoint, thus erasing the congestion indication and + disabling ECN-Capability at the same time. + + The `erasure' of the congestion indication is only effective if the + packet does not end up being marked or dropped again by a downstream + router. If the CE codepoint is replaced by an ECT codepoint, the + + + +Ramakrishnan, et al. Standards Track [Page 46] + +RFC 3168 The Addition of ECN to IP September 2001 + + + packet remains ECN-Capable, and could be either marked or dropped by + a downstream router as an indication of congestion. 
If the CE + codepoint is replaced by the not-ECT codepoint, the packet is no + longer ECN-capable, and can therefore be dropped but not marked by a + downstream router as an indication of congestion. + +18.1.2. Falsely Reporting Congestion + + This change is to set the CE codepoint when an ECT codepoint was + already set, even though there was no congestion. This change does + not affect the treatment of that packet along the rest of the path. + In particular, a router does not examine the CE codepoint in deciding + whether to drop or mark an arriving packet. + + However, this could result in the application unnecessarily invoking + end-to-end congestion control, and reducing its arrival rate. By + itself, this is no worse (for the application or for the network) + than if the tampering router had actually dropped the packet. + +18.1.3. Disabling ECN-Capability + + This change is to turn off the ECT codepoint of a packet. This means + that if the packet later encounters congestion (e.g., by arriving to + a RED queue with a moderate average queue size), it will be dropped + instead of being marked. By itself, this is no worse (for the + application) than if the tampering router had actually dropped the + packet. The saving grace in this particular case is that there is no + congested router upstream expecting a reaction from setting the CE + bit. + +18.1.4. Falsely Indicating ECN-Capability + + This change would incorrectly label a packet as ECN-Capable. The + packet may have been sent either by an ECN-Capable transport or a + transport that is not ECN-Capable. + + If the packet later encounters moderate congestion at an ECN-Capable + router, the router could set the CE codepoint instead of dropping the + packet. If the transport protocol in fact is not ECN-Capable, then + the transport will never receive this indication of congestion, and + will not reduce its sending rate in response. 
The potential + consequences of falsely indicating ECN-capability are discussed + further in Section 19 below. + + If the packet never later encounters congestion at an ECN-Capable + router, then the first of these two changes would have no effect, + other than possibly interfering with the use of the ECN nonce by the + transport protocol. The last change, however, would have the effect + + + +Ramakrishnan, et al. Standards Track [Page 47] + +RFC 3168 The Addition of ECN to IP September 2001 + + + of giving false reports of congestion to a monitoring device along + the path. If the transport protocol is ECN-Capable, then this change + could also have an effect at the transport level, by combining + falsely indicating ECN-Capability with falsely reporting congestion. + For an ECN-capable transport, this would cause the transport to + unnecessarily react to congestion. In this particular case, the + router that is incorrectly changing the ECN field could have dropped + the packet. Thus for this case of an ECN-capable transport, the + consequence of this change to the ECN field is no worse than dropping + the packet. + +18.2. Information carried in the Transport Header + + For TCP, an ECN-capable TCP receiver informs its TCP peer that it is + ECN-capable at the TCP level, conveying this information in the TCP + header at the time the connection is setup. This document does not + consider potential dangers introduced by changes in the transport + header within the network. We note that when IPsec is used, the + transport header is protected both in tunnel and transport modes + [ESP, AH]. + + Another issue concerns TCP packets with a spoofed IP source address + carrying invalid ECN information in the transport header. For + completeness, we examine here some possible ways that a node spoofing + the IP source address of another node could use the two ECN flags in + the TCP header to launch a denial-of-service attack. 
However, these + attacks would require an ability for the attacker to use valid TCP + sequence numbers, and any attacker with this ability and with the + ability to spoof IP source addresses could damage the TCP connection + without using the ECN flags. Therefore, ECN does not add any new + vulnerabilities in this respect. + + An acknowledgement packet with a spoofed IP source address of the TCP + data receiver could include the ECE bit set. If accepted by the TCP + data sender as a valid packet, this spoofed acknowledgement packet + could result in the TCP data sender unnecessarily halving its + congestion window. However, to be accepted by the data sender, such + a spoofed acknowledgement packet would have to have the correct 32- + bit sequence number as well as a valid acknowledgement number. An + attacker that could successfully send such a spoofed acknowledgement + packet could also send a spoofed RST packet, or do other equally + damaging operations to the TCP connection. + + Packets with a spoofed IP source address of the TCP data sender could + include the CWR bit set. Again, to be accepted, such a packet would + have to have a valid sequence number. In addition, such a spoofed + packet would have a limited performance impact. Spoofing a data + packet with the CWR bit set could result in the TCP data receiver + + + +Ramakrishnan, et al. Standards Track [Page 48] + +RFC 3168 The Addition of ECN to IP September 2001 + + + sending fewer ECE packets than it would otherwise, if the data + receiver was sending ECE packets when it received the spoofed CWR + packet. + +18.3. Split Paths + + In some cases, a malicious or broken router might have access to only + a subset of the packets from a flow. The question is as follows: + can this router, by altering the ECN field in this subset of the + packets, do more damage to that flow than if it had simply dropped + that set of packets? 
+ + We will classify the packets in the flow as A packets and B packets, + and assume that the adversary only has access to A packets. Assume + that the adversary is subverting end-to-end congestion control along + the path traveled by A packets only, by either falsely indicating + ECN-Capability upstream of the point where congestion occurs, or + erasing the congestion indication downstream. Consider also that + there exists a monitoring device that sees both the A and B packets, + and will "punish" both the A and B packets if the total flow is + determined not to be properly responding to indications of + congestion. Another key characteristic that we believe is likely to + be true is that the monitoring device, before `punishing' the A&B + flow, will first drop packets instead of setting the CE codepoint, + and will drop arriving packets of that flow that already have the CE + codepoint set. If the end nodes are in fact using end-to-end + congestion control, they will see all of the indications of + congestion seen by the monitoring device, and will begin to respond + to these indications of congestion. Thus, the monitoring device is + successful in providing the indications to the flow at an early + stage. + + It is true that the adversary that has access only to the A packets + might, by subverting ECN-based congestion control, be able to deny + the benefits of ECN to the other packets in the A&B aggregate. While + this is unfortunate, this is not a reason to disable ECN. + + A variant of falsely reporting congestion occurs when there are two + adversaries along a path, where the first adversary falsely reports + congestion, and the second adversary `erases' those reports. (Unlike + packet drops, ECN congestion reports can be `reversed' later in the + network by a malicious or broken router. However, the use of the ECN + nonce could help the transport to detect this behavior.) 
While this + would be transparent to the end node, it is possible that a + monitoring device between the first and second adversaries would see + the false indications of congestion. Keep in mind our recommendation + in this document, that before `punishing' a flow for not responding + appropriately to congestion, the router will first switch to dropping + + + +Ramakrishnan, et al. Standards Track [Page 49] + +RFC 3168 The Addition of ECN to IP September 2001 + + + rather than marking as an indication of congestion, for that flow. + When this includes dropping arriving packets from that flow that have + the CE codepoint set, this ensures that these indications of + congestion are being seen by the end nodes. Thus, there is no + additional harm that we are able to postulate as a result of multiple + conflicting adversaries. + +19. Implications of Subverting End-to-End Congestion Control + + This section focuses on the potential repercussions of subverting + end-to-end congestion control by either falsely indicating ECN- + Capability, or by erasing the congestion indication in ECN (the CE + codepoint). Subverting end-to-end congestion control by either of + these two methods can have consequences both for the application and + for the network. We discuss these separately below. + + The first method to subvert end-to-end congestion control, that of + falsely indicating ECN-Capability, effectively subverts end-to-end + congestion control only if the packet later encounters congestion + that results in the setting of the CE codepoint. In this case, the + transport protocol (which may not be ECN-capable) does not receive + the indication of congestion from these downstream congested routers. + + The second method to subvert end-to-end congestion control, `erasing' + the CE codepoint in a packet, effectively subverts end-to-end + congestion control only when the CE codepoint in the packet was set + earlier by a congested router. 
In this case, the transport protocol + does not receive the indication of congestion from the upstream + congested routers. + + Either of these two methods of subverting end-to-end congestion + control can potentially introduce more damage to the network (and + possibly to the flow itself) than if the adversary had simply dropped + packets from that flow. However, as we discuss later in this section + and in Section 7, this potential damage is limited. + +19.1. Implications for the Network and for Competing Flows + + The CE codepoint of the ECN field is only used by routers as an + indication of congestion during periods of *moderate* congestion. + ECN-capable routers should drop rather than mark packets during heavy + congestion even if the router's queue is not yet full. For example, + for routers using active queue management based on RED, the router + should drop rather than mark packets that arrive while the average + queue sizes exceed the RED queue's maximum threshold. + + + + + + +Ramakrishnan, et al. Standards Track [Page 50] + +RFC 3168 The Addition of ECN to IP September 2001 + + + One consequence for the network of subverting end-to-end congestion + control is that flows that do not receive the congestion indications + from the network might increase their sending rate until they drive + the network into heavier congestion. Then, the congested router + could begin to drop rather than mark arriving packets. For flows + that are not isolated by some form of per-flow scheduling or other + per-flow mechanisms, but are instead aggregated with other flows in a + single queue in an undifferentiated fashion, this packet-dropping at + the congested router would apply to all flows that share that queue. + Thus, the consequences would be to increase the level of congestion + in the network. 
+ + In some cases, the increase in the level of congestion will lead to a + substantial buffer buildup at the congested queue that will be + sufficient to drive the congested queue from the packet-marking to + the packet-dropping regime. This transition could occur either + because of buffer overflow, or because of the active queue management + policy described above that drops packets when the average queue is + above RED's maximum threshold. At this point, all flows, including + the subverted flow, will begin to see packet drops instead of packet + marks, and a malicious or broken router will no longer be able to ` + erase' these indications of congestion in the network. If the end + nodes are deploying appropriate end-to-end congestion control, then + the subverted flow will reduce its arrival rate in response to + congestion. When the level of congestion is sufficiently reduced, + the congested queue can return from the packet-dropping regime to the + packet-marking regime. The steady-state pattern could be one of the + congested queue oscillating between these two regimes. + + In other cases, the consequences of subverting end-to-end congestion + control will not be severe enough to drive the congested link into + sufficiently-heavy congestion that packets are dropped instead of + being marked. In this case, the implications for competing flows in + the network will be a slightly-increased rate of packet marking or + dropping, and a corresponding decrease in the bandwidth available to + those flows. This can be a stable state if the arrival rate of the + subverted flow is sufficiently small, relative to the link bandwidth, + that the average queue size at the congested router remains under + control. In particular, the subverted flow could have a limited + bandwidth demand on the link at this router, while still getting more + than its "fair" share of the link. 
This limited demand could be due + to a limited demand from the data source; a limitation from the TCP + advertised window; a lower-bandwidth access pipe; or other factors. + Thus the subversion of ECN-based congestion control can still lead to + unfairness, which we believe is appropriate to note here. + + + + + + +Ramakrishnan, et al. Standards Track [Page 51] + +RFC 3168 The Addition of ECN to IP September 2001 + + + The threat to the network posed by the subversion of ECN-based + congestion control in the network is essentially the same as the + threat posed by an end-system that intentionally fails to cooperate + with end-to-end congestion control. The deployment of mechanisms in + routers to address this threat is an open research question, and is + discussed further in Section 10. + + Let us take the example described in Section 18.1.1, where the CE + codepoint that was set in a packet is erased: {'11' -> '10' or '11' + -> '01'}. The consequence for the congested upstream router that set + the CE codepoint is that this congestion indication does not reach + the end nodes for that flow. The source (even one which is completely + cooperative and not malicious) is thus allowed to continue to + increase its sending rate (if it is a TCP flow, by increasing its + congestion window). The flow potentially achieves better throughput + than the other flows that also share the congested router, especially + if there are no policing mechanisms or per-flow queuing mechanisms at + that router. Consider the behavior of the other flows, especially if + they are cooperative: that is, the flows that do not experience + subverted end-to-end congestion control. They are likely to reduce + their load (e.g., by reducing their window size) on the congested + router, thus benefiting our subverted flow. This results in + unfairness. 
As we discussed above, this unfairness could either be + transient (because the congested queue is driven into the packet- + marking regime), oscillatory (because the congested queue oscillates + between the packet marking and the packet dropping regime), or more + moderate but a persistent stable state (because the congested queue + is never driven to the packet dropping regime). + + The results would be similar if the subverted flow was intentionally + avoiding end-to-end congestion control. One difference is that a + flow that is intentionally avoiding end-to-end congestion control at + the end nodes can avoid end-to-end congestion control even when the + congested queue is in packet-dropping mode, by refusing to reduce its + sending rate in response to packet drops in the network. Thus the + problems for the network from the subversion of ECN-based congestion + control are less severe than the problems caused by the intentional + avoidance of end-to-end congestion control in the end nodes. It is + also the case that it is considerably more difficult to control the + behavior of the end nodes than it is to control the behavior of the + infrastructure itself. This is not to say that the problems for the + network posed by the network's subversion of ECN-based congestion + control are small; just that they are dwarfed by the problems for the + network posed by the subversion of either ECN-based or other + currently known packet-based congestion control mechanisms by the end + nodes. + + + + + +Ramakrishnan, et al. Standards Track [Page 52] + +RFC 3168 The Addition of ECN to IP September 2001 + + +19.2. Implications for the Subverted Flow + + When a source indicates that it is ECN-capable, there is an + expectation that the routers in the network that are capable of + participating in ECN will use the CE codepoint for indication of + congestion. 
There is the potential benefit of using ECN in reducing + the amount of packet loss (in addition to the reduced queuing delays + because of active queue management policies). When the packet flows + through an IPsec tunnel where the nodes that the tunneled packets + traverse are untrusted in some way, the expectation is that IPsec + will protect the flow from subversion that results in undesirable + consequences. + + In many cases, a subverted flow will benefit from the subversion of + end-to-end congestion control for that flow in the network, by + receiving more bandwidth than it would have otherwise, relative to + competing non-subverted flows. If the congested queue reaches the + packet-dropping stage, then the subversion of end-to-end congestion + control might or might not be of overall benefit to the subverted + flow, depending on that flow's relative tradeoffs between throughput, + loss, and delay. + + One form of subverting end-to-end congestion control is to falsely + indicate ECN-capability by setting the ECT codepoint. This has the + consequence of downstream congested routers setting the CE codepoint + in vain. However, as described in Section 9.1.2, if an ECT codepoint + is changed in an IP tunnel, this can be detected at the egress point + of the tunnel, as long as the inner header was not changed within the + tunnel. + + The second form of subverting end-to-end congestion control is to + erase the congestion indication by erasing the CE codepoint. In this + case, it is the upstream congested routers that set the CE codepoint + in vain. + + If an ECT codepoint is erased within an IP tunnel, then this can be + detected at the egress point of the tunnel, as long as the inner + header was not changed within the tunnel. If the CE codepoint is set + upstream of the IP tunnel, then any erasure of the outer header's CE + codepoint within the tunnel will have no effect because the inner + header preserves the set value of the CE codepoint. 
However, if the + CE codepoint is set within the tunnel, and erased either within or + downstream of the tunnel, this is not necessarily detected at the + egress point of the tunnel. + + With this subversion of end-to-end congestion control, an end-system + transport does not respond to the congestion indication. Along with + the increased unfairness for the non-subverted flows described in the + + + +Ramakrishnan, et al. Standards Track [Page 53] + +RFC 3168 The Addition of ECN to IP September 2001 + + + previous section, the congested router's queue could continue to + build, resulting in packet loss at the congested router - which is a + means for indicating congestion to the transport in any case. In the + interim, the flow might experience higher queuing delays, possibly + along with an increased bandwidth relative to other non-subverted + flows. But transports do not inherently make assumptions of + consistently experiencing carefully managed queuing in the path. We + believe that these forms of subverting end-to-end congestion control + are no worse for the subverted flow than if the adversary had simply + dropped the packets of that flow itself. + +19.3. Non-ECN-Based Methods of Subverting End-to-end Congestion Control + + We have shown that, in many cases, a malicious or broken router that + is able to change the bits in the ECN field can do no more damage + than if it had simply dropped the packet in question. However, this + is not true in all cases, in particular in the cases where the broken + router subverted end-to-end congestion control by either falsely + indicating ECN-Capability or by erasing the ECN congestion indication + (in the CE codepoint). While there are many ways that a router can + harm a flow by dropping packets, a router cannot subvert end-to-end + congestion control by dropping packets. As an example, a router + cannot subvert TCP congestion control by dropping data packets, + acknowledgement packets, or control packets. 
+ + Even though packet-dropping cannot be used to subvert end-to-end + congestion control, there *are* non-ECN-based methods for subverting + end-to-end congestion control that a broken or malicious router could + use. For example, a broken router could duplicate data packets, thus + effectively negating the effects of end-to-end congestion control + along some portion of the path. (For a router that duplicated + packets within an IPsec tunnel, the security administrator can cause + the duplicate packets to be discarded by configuring anti-replay + protection for the tunnel.) This duplication of packets within the + network would have similar implications for the network and for the + subverted flow as those described in Sections 18.1.1 and 18.1.4 + above. + +20. The Motivation for the ECT Codepoints. + +20.1. The Motivation for an ECT Codepoint. + + The need for an ECT codepoint is motivated by the fact that ECN will + be deployed incrementally in an Internet where some transport + protocols and routers understand ECN and some do not. With an ECT + codepoint, the router can drop packets from flows that are not ECN- + capable, but can *instead* set the CE codepoint in packets that *are* + + + + +Ramakrishnan, et al. Standards Track [Page 54] + +RFC 3168 The Addition of ECN to IP September 2001 + + + ECN-capable. Because an ECT codepoint allows an end node to have the + CE codepoint set in a packet *instead* of having the packet dropped, + an end node might have some incentive to deploy ECN. + + If there was no ECT codepoint, then the router would have to set the + CE codepoint for packets from both ECN-capable and non-ECN-capable + flows. In this case, there would be no incentive for end-nodes to + deploy ECN, and no viable path of incremental deployment from a non- + ECN world to an ECN-capable world. Consider the first stages of such + an incremental deployment, where a subset of the flows are ECN- + capable. 
At the onset of congestion, when the packet + dropping/marking rate would be low, routers would only set CE + codepoints, rather than dropping packets. However, only those flows + that are ECN-capable would understand and respond to CE packets. The + result is that the ECN-capable flows would back off, and the non- + ECN-capable flows would be unaware of the ECN signals and would + continue to open their congestion windows. + + In this case, there are two possible outcomes: (1) the ECN-capable + flows back off, the non-ECN-capable flows get all of the bandwidth, + and congestion remains mild, or (2) the ECN-capable flows back off, + the non-ECN-capable flows don't, and congestion increases until the + router transitions from setting the CE codepoint to dropping packets. + While this second outcome evens out the fairness, the ECN-capable + flows would still receive little benefit from being ECN-capable, + because the increased congestion would drive the router to packet- + dropping behavior. + + A flow that advertised itself as ECN-Capable but does not respond to + CE codepoints is functionally equivalent to a flow that turns off + congestion control, as discussed earlier in this document. + + Thus, in a world when a subset of the flows are ECN-capable, but + where ECN-capable flows have no mechanism for indicating that fact to + the routers, there would be less effective and less fair congestion + control in the Internet, resulting in a strong incentive for end + nodes not to deploy ECN. + +20.2. The Motivation for two ECT Codepoints. + + The primary motivation for the two ECT codepoints is to provide a + one-bit ECN nonce. The ECN nonce allows the development of + mechanisms for the sender to probabilistically verify that network + elements are not erasing the CE codepoint, and that data receivers + are properly reporting to the sender the receipt of packets with the + CE codepoint set. + + + + + +Ramakrishnan, et al. 
 Standards Track [Page 55] + +RFC 3168 The Addition of ECN to IP September 2001 + + + Another possibility for senders to detect misbehaving network + elements or receivers would be for the data sender to occasionally + send a data packet with the CE codepoint set, to see if the receiver + reports receiving the CE codepoint. Of course, if these packets + encountered congestion in the network, the router might make no + change in the packets, because the CE codepoint would already be set. + Thus, for packets sent with the CE codepoint set, the TCP end-nodes + could not determine if some router intended to set the CE codepoint + in these packets. For this reason, sending packets with the CE + codepoint would have to be done sparingly, and would be a less + effective check against misbehaving network elements and receivers + than would be the ECN nonce. + + The assignment of the fourth ECN codepoint to ECT(1) precludes the + use of this codepoint for some other purposes. For clarity, we + briefly list other possible purposes here. + + One possibility might have been for the data sender to use the fourth + ECN codepoint to indicate an alternate semantics for ECN. However, + this seems to us more appropriate to be signaled using a + differentiated services codepoint in the DS field. + + A second possible use for the fourth ECN codepoint would have been to + give the router two separate codepoints for the indication of + congestion, CE(0) and CE(1), for mild and severe congestion + respectively. While this could be useful in some cases, this + certainly does not seem a compelling requirement at this point. If + there was judged to be a compelling need for this, the complications + of incremental deployment would most likely necessitate more than + just one codepoint for this function.
+ + A third use that has been informally proposed for the ECN codepoint + is for use in some forms of multicast congestion control, based on + randomized procedures for duplicating marked packets at routers. + Some proposed multicast packet duplication procedures are based on a + new ECN codepoint that (1) conveys the fact that congestion occurred + upstream of the duplication point that marked the packet with this + codepoint and (2) can detect congestion downstream of that + duplication point. ECT(1) can serve this purpose because it is both + distinct from ECT(0) and is replaced by CE when ECN marking occurs in + response to congestion or incipient congestion. Explanation of how + this enhanced version of ECN would be used by multicast congestion + control is beyond the scope of this document, as are ECN-aware + multicast packet duplication procedures and the processing of the ECN + field at multicast receivers in all cases (i.e., irrespective of the + multicast packet duplication procedure(s) used). + + + + + +Ramakrishnan, et al. Standards Track [Page 56] + +RFC 3168 The Addition of ECN to IP September 2001 + + + The specification of IP tunnel modifications for ECN in this document + assumes that the only change made to the outer IP header's ECN field + between tunnel endpoints is to set the CE codepoint to indicate + congestion. This is not consistent with some of the proposed uses of + ECT(1) by the multicast duplication procedures in the previous + paragraph, and such procedures SHOULD NOT be deployed unless this + inconsistency between multicast duplication procedures and IP tunnels + with full ECN functionality is resolved. Limited ECN functionality + may be used instead, although in practice many tunnel protocols + (including IPsec) will not work correctly if multicast traffic + duplication occurs within the tunnel. + +21. Why use Two Bits in the IP Header?
+ + Given the need for an ECT indication in the IP header, there still + remains the question of whether the ECT (ECN-Capable Transport) and + CE (Congestion Experienced) codepoints should have been overloaded on + a single bit. This overloaded-one-bit alternative, explored in + [Floyd94], would have involved a single bit with two values. One + value, "ECT and not CE", would represent an ECN-Capable Transport, + and the other value, "CE or not ECT", would represent either + Congestion Experienced or a non-ECN-Capable transport. + + One difference between the one-bit and two-bit implementations + concerns packets that traverse multiple congested routers. Consider + a CE packet that arrives at a second congested router, and is + selected by the active queue management at that router for either + marking or dropping. In the one-bit implementation, the second + congested router has no choice but to drop the CE packet, because it + cannot distinguish between a CE packet and a non-ECT packet. In the + two-bit implementation, the second congested router has the choice of + either dropping the CE packet, or of leaving it alone with the CE + codepoint set. + + Another difference between the one-bit and two-bit implementations + comes from the fact that with the one-bit implementation, receivers + in a single flow cannot distinguish between CE and non-ECT packets. + Thus, in the one-bit implementation an ECN-capable data sender would + have to unambiguously indicate to the receiver or receivers whether + each packet had been sent as ECN-Capable or as non-ECN-Capable. One + possibility would be for the sender to indicate in the transport + header whether the packet was sent as ECN-Capable. A second + possibility that would involve a functional limitation for the one- + bit implementation would be for the sender to unambiguously indicate + that it was going to send *all* of its packets as ECN-Capable or as + non-ECN-Capable. 
For a multicast transport protocol, this + unambiguous indication would have to be apparent to receivers joining + an on-going multicast session. + + + +Ramakrishnan, et al. Standards Track [Page 57] + +RFC 3168 The Addition of ECN to IP September 2001 + + + Another concern that was described earlier (and recommended in this + document) is that transports (particularly TCP) should not mark pure + ACK packets or retransmitted packets as being ECN-Capable. A pure + ACK packet from a non-ECN-capable transport could be dropped, without + necessarily having an impact on the transport from a congestion + control perspective (because subsequent ACKs are cumulative). An + ECN-capable transport reacting to the CE codepoint in a pure ACK + packet by reducing the window would be at a disadvantage in + comparison to a non-ECN-capable transport. For this reason (and for + reasons described earlier in relation to retransmitted packets), it + is desirable to have the ECT codepoint set on a per-packet basis. + + Another advantage of the two-bit approach is that it is somewhat more + robust. The most critical issue, discussed in Section 8, is that the + default indication should be that of a non-ECN-Capable transport. In + a two-bit implementation, this requirement for the default value + simply means that the not-ECT codepoint should be the default. In + the one-bit implementation, this means that the single overloaded bit + should by default be in the "CE or not ECT" position. This is less + clear and straightforward, and possibly more open to incorrect + implementations either in the end nodes or in the routers. + + In summary, while the one-bit implementation could be a possible + implementation, it has the following significant limitations relative + to the two-bit implementation. First, the one-bit implementation has + more limited functionality for the treatment of CE packets at a + second congested router. 
Second, the one-bit implementation requires + either that extra information be carried in the transport header of + packets from ECN-Capable flows (to convey the functionality of the + second bit elsewhere, namely in the transport header), or that + senders in ECN-Capable flows accept the limitation that receivers + must be able to determine a priori which packets are ECN-Capable and + which are not ECN-Capable. Third, the one-bit implementation is + possibly more open to errors from faulty implementations that choose + the wrong default value for the ECN bit. We believe that the use of + the extra bit in the IP header for the ECT-bit is extremely valuable + to overcome these limitations. + +22. Historical Definitions for the IPv4 TOS Octet + + RFC 791 [RFC791] defined the ToS (Type of Service) octet in the IP + header. In RFC 791, bits 6 and 7 of the ToS octet are listed as + "Reserved for Future Use", and are shown set to zero. The first two + fields of the ToS octet were defined as the Precedence and Type of + Service (TOS) fields. + + + + + + +Ramakrishnan, et al. Standards Track [Page 58] + +RFC 3168 The Addition of ECN to IP September 2001 + + + 0 1 2 3 4 5 6 7 + +-----+-----+-----+-----+-----+-----+-----+-----+ + | PRECEDENCE | TOS | 0 | 0 | RFC 791 + +-----+-----+-----+-----+-----+-----+-----+-----+ + + RFC 1122 included bits 6 and 7 in the TOS field, though it did not + discuss any specific use for those two bits: + + 0 1 2 3 4 5 6 7 + +-----+-----+-----+-----+-----+-----+-----+-----+ + | PRECEDENCE | TOS | RFC 1122 + +-----+-----+-----+-----+-----+-----+-----+-----+ + + The IPv4 TOS octet was redefined in RFC 1349 [RFC1349] as follows: + + 0 1 2 3 4 5 6 7 + +-----+-----+-----+-----+-----+-----+-----+-----+ + | PRECEDENCE | TOS | MBZ | RFC 1349 + +-----+-----+-----+-----+-----+-----+-----+-----+ + + Bit 6 in the TOS field was defined in RFC 1349 for "Minimize Monetary + Cost". 
In addition to the Precedence and Type of Service (TOS) + fields, the last field, MBZ (for "must be zero") was defined as + currently unused. RFC 1349 stated that "The originator of a datagram + sets [the MBZ] field to zero (unless participating in an Internet + protocol experiment which makes use of that bit)." + + RFC 1455 [RFC 1455] defined an experimental standard that used all + four bits in the TOS field to request a guaranteed level of link + security. + + RFC 1349 and RFC 1455 have been obsoleted by "Definition of the + Differentiated Services Field (DS Field) in the IPv4 and IPv6 + Headers" [RFC2474] in which bits 6 and 7 of the DS field are listed + as Currently Unused (CU). RFC 2780 [RFC2780] specified ECN as an + experimental use of the two-bit CU field. RFC 2780 updated the + definition of the DS Field to only encompass the first six bits of + this octet rather than all eight bits; these first six bits are + defined as the Differentiated Services CodePoint (DSCP): + + 0 1 2 3 4 5 6 7 + +-----+-----+-----+-----+-----+-----+-----+-----+ + | DSCP | CU | RFCs 2474, + +-----+-----+-----+-----+-----+-----+-----+-----+ 2780 + + Because of this unstable history, the definition of the ECN field in + this document cannot be guaranteed to be backwards compatible with + all past uses of these two bits. + + + +Ramakrishnan, et al. Standards Track [Page 59] + +RFC 3168 The Addition of ECN to IP September 2001 + + + Prior to RFC 2474, routers were not permitted to modify bits in + either the DSCP or ECN field of packets forwarded through them, and + hence routers that comply only with RFCs prior to 2474 should have no + effect on ECN. For end nodes, bit 7 (the second ECN bit) must be + transmitted as zero for any implementation compliant only with RFCs + prior to 2474. 
Such nodes may transmit bit 6 (the first ECN bit) as + one for the "Minimize Monetary Cost" provision of RFC 1349 or the + experiment authorized by RFC 1455; neither this aspect of RFC 1349 + nor the experiment in RFC 1455 were widely implemented or used. The + damage that could be done by a broken, non-conformant router would + include "erasing" the CE codepoint for an ECN-capable packet that + arrived at the router with the CE codepoint set, or setting the CE + codepoint even in the absence of congestion. This has been discussed + in the section on "Non-compliance in the Network". + + The damage that could be done in an ECN-capable environment by a + non-ECN-capable end-node transmitting packets with the ECT codepoint + set has been discussed in the section on "Non-compliance by the End + Nodes". + +23. IANA Considerations + + This section contains the namespaces that have either been created in + this specification, or the values assigned in existing namespaces + managed by IANA. + +23.1. IPv4 TOS Byte and IPv6 Traffic Class Octet + + The codepoints for the ECN Field of the IP header are specified by + the Standards Action of this RFC, as is required by RFC 2780. + + When this document is published as an RFC, IANA should create a new + registry, "IPv4 TOS Byte and IPv6 Traffic Class Octet", with the + namespace as follows: + + IPv4 TOS Byte and IPv6 Traffic Class Octet + + Description: The registrations are identical for IPv4 and IPv6. + + Bits 0-5: see Differentiated Services Field Codepoints Registry + (http://www.iana.org/assignments/dscp-registry) + + + + + + + + + + +Ramakrishnan, et al. Standards Track [Page 60] + +RFC 3168 The Addition of ECN to IP September 2001 + + + Bits 6-7, ECN Field: + + Binary Keyword References + ------ ------- ---------- + 00 Not-ECT (Not ECN-Capable Transport) [RFC 3168] + 01 ECT(1) (ECN-Capable Transport(1)) [RFC 3168] + 10 ECT(0) (ECN-Capable Transport(0)) [RFC 3168] + 11 CE (Congestion Experienced) [RFC 3168] + +23.2. 
TCP Header Flags + + The codepoints for the CWR and ECE flags in the TCP header are + specified by the Standards Action of this RFC, as is required by RFC + 2780. + + When this document is published as an RFC, IANA should create a new + registry, "TCP Header Flags", with the namespace as follows: + + TCP Header Flags + + The Transmission Control Protocol (TCP) included a 6-bit Reserved + field defined in RFC 793, reserved for future use, in bytes 13 and 14 + of the TCP header, as illustrated below. The other six Control bits + are defined separately by RFC 793. + + 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ + | | | U | A | P | R | S | F | + | Header Length | Reserved | R | C | S | S | Y | I | + | | | G | K | H | T | N | N | + +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ + + RFC 3168 defines two of the six bits from the Reserved field to be + used for ECN, as follows: + + 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ + | | | C | E | U | A | P | R | S | F | + | Header Length | Reserved | W | C | R | C | S | S | Y | I | + | | | R | E | G | K | H | T | N | N | + +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ + + + + + + + + + + +Ramakrishnan, et al. Standards Track [Page 61] + +RFC 3168 The Addition of ECN to IP September 2001 + + + TCP Header Flags + + Bit Name Reference + --- ---- --------- + 8 CWR (Congestion Window Reduced) [RFC 3168] + 9 ECE (ECN-Echo) [RFC 3168] + +23.3. IPSEC Security Association Attributes + + IANA allocated the IPSEC Security Association Attribute value 10 for + the ECN Tunnel use described in Section 9.2.1.2 above at the request + of David Black in November 1999. The IANA has changed the Reference + for this allocation from David Black's request to this RFC. + +24. Authors' Addresses + + K. K. Ramakrishnan + TeraOptic Networks, Inc. 
+ + Phone: +1 (408) 666-8650 + EMail: kk@teraoptic.com + + + Sally Floyd + ACIRI + + Phone: +1 (510) 666-2989 + EMail: floyd@aciri.org + URL: http://www.aciri.org/floyd/ + + + David L. Black + EMC Corporation + 42 South St. + Hopkinton, MA 01748 + + Phone: +1 (508) 435-1000 x75140 + EMail: black_david@emc.com + + + + + + + + + + + + + +Ramakrishnan, et al. Standards Track [Page 62] + +RFC 3168 The Addition of ECN to IP September 2001 + + +25. Full Copyright Statement + + Copyright (C) The Internet Society (2001). All Rights Reserved. + + This document and translations of it may be copied and furnished to + others, and derivative works that comment on or otherwise explain it + or assist in its implementation may be prepared, copied, published + and distributed, in whole or in part, without restriction of any + kind, provided that the above copyright notice and this paragraph are + included on all such copies and derivative works. However, this + document itself may not be modified in any way, such as by removing + the copyright notice or references to the Internet Society or other + Internet organizations, except as needed for the purpose of + developing Internet standards in which case the procedures for + copyrights defined in the Internet Standards process must be + followed, or as required to translate it into languages other than + English. + + The limited permissions granted above are perpetual and will not be + revoked by the Internet Society or its successors or assigns. + + This document and the information contained herein is provided on an + "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING + TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING + BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION + HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF + MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. 
+ +Acknowledgement + + Funding for the RFC Editor function is currently provided by the + Internet Society. + + + + + + + + + + + + + + + + + + + +Ramakrishnan, et al. Standards Track [Page 63] + diff --git a/ext/picotcp/RFC/rfc3449.txt b/ext/picotcp/RFC/rfc3449.txt new file mode 100644 index 0000000..46936b0 --- /dev/null +++ b/ext/picotcp/RFC/rfc3449.txt @@ -0,0 +1,2299 @@ + + + + + + +Network Working Group H. Balakrishnan +Request for Comments: 3449 MIT LCS +BCP: 69 V. N. Padmanabhan +Category: Best Current Practice Microsoft Research + G. Fairhurst + M. Sooriyabandara + University of Aberdeen, U.K. + December 2002 + + + TCP Performance Implications + of Network Path Asymmetry + +Status of this Memo + + This document specifies an Internet Best Current Practices for the + Internet Community, and requests discussion and suggestions for + improvements. Distribution of this memo is unlimited. + +Copyright Notice + + Copyright (C) The Internet Society (2002). All Rights Reserved. + +Abstract + + This document describes TCP performance problems that arise because + of asymmetric effects. These problems arise in several access + networks, including bandwidth-asymmetric networks and packet radio + subnetworks, for different underlying reasons. However, the end + result on TCP performance is the same in both cases: performance + often degrades significantly because of imperfection and variability + in the ACK feedback from the receiver to the sender. + + The document details several mitigations to these effects, which have + either been proposed or evaluated in the literature, or are currently + deployed in networks. 
These solutions use a combination of local + link-layer techniques, subnetwork, and end-to-end mechanisms, + consisting of: (i) techniques to manage the channel used for the + upstream bottleneck link carrying the ACKs, typically using header + compression or reducing the frequency of TCP ACKs, (ii) techniques to + handle this reduced ACK frequency to retain the TCP sender's + acknowledgment-triggered self-clocking and (iii) techniques to + schedule the data and ACK packets in the reverse direction to improve + performance in the presence of two-way traffic. Each technique is + described, together with known issues, and recommendations for use. + A summary of the recommendations is provided at the end of the + document. + + + + +Balakrishnan et. al. Best Current Practice [Page 1] + +RFC 3449 PILC - Asymmetric Links December 2002 + + +Table of Contents + + 1. Conventions used in this Document ...............................3 + 2. Motivation ....................................................4 + 2.1 Asymmetry due to Differences in Transmit + and Receive Capacity .........................................4 + 2.2 Asymmetry due to Shared Media in the Reverse Direction .......5 + 2.3 The General Problem ..........................................5 + 3. How does Asymmetry Degrade TCP Performance? .....................5 + 3.1 Asymmetric Capacity ..........................................5 + 3.2 MAC Protocol Interactions ....................................7 + 3.3 Bidirectional Traffic ........................................8 + 3.4 Loss in Asymmetric Network Paths ............................10 + 4. 
Improving TCP Performance using Host Mitigations ...............10 + 4.1 Modified Delayed ACKs .......................................11 + 4.2 Use of Large MSS ............................................12 + 4.3 ACK Congestion Control ......................................13 + 4.4 Window Prediction Mechanism .................................14 + 4.5 Acknowledgement based on Cwnd Estimation. ...................14 + 4.6 TCP Sender Pacing ...........................................14 + 4.7 TCP Byte Counting ...........................................15 + 4.8 Backpressure ................................................16 + 5. Improving TCP performance using Transparent Modifications ......17 + 5.1 TYPE 0: Header Compression ..................................18 + 5.1.1 TCP Header Compression ..................................18 + 5.1.2 Alternate Robust Header Compression Algorithms ..........19 + 5.2 TYPE 1: Reverse Link Bandwidth Management ...................19 + 5.2.1 ACK Filtering ...........................................20 + 5.2.2 ACK Decimation ..........................................21 + 5.3 TYPE 2: Handling Infrequent ACKs ............................22 + 5.3.1 ACK Reconstruction ......................................23 + 5.3.2 ACK Compaction and Companding ...........................25 + 5.3.3 Mitigating TCP packet bursts generated by + Infrequent ACKs .........................................26 + 5.4 TYPE 3: Upstream Link Scheduling ............................27 + 5.4.1 Per-Flow queuing at the Upstream Bottleneck Link ........27 + 5.4.2 ACKs-first Scheduling ...................................28 + 6. Security Considerations ........................................29 + 7. Summary ........................................................30 + 8. Acknowledgments ................................................32 + 9. References .....................................................32 + 10. 
IANA Considerations ...........................................37 + Appendix: Examples of Subnetworks Exhibiting Network Path + Asymmetry ...............................................38 + Authors' Addresses ................................................40 + Full Copyright Statement ..........................................41 + + + + + +Balakrishnan et. al. Best Current Practice [Page 2] + +RFC 3449 PILC - Asymmetric Links December 2002 + + +1. Conventions used in this Document + + FORWARD DIRECTION: The dominant direction of data transfer over an + asymmetric network path. It corresponds to the direction with better + characteristics in terms of capacity, latency, error rate, etc. Data + transfer in the forward direction is called "forward transfer". + Packets travelling in the forward direction follow the forward path + through the IP network. + + REVERSE DIRECTION: The direction in which acknowledgments of a + forward TCP transfer flow. Data transfer could also happen in this + direction (and is termed "reverse transfer"), but it is typically + less voluminous than that in the forward direction. The reverse + direction typically exhibits worse characteristics than the forward + direction. Packets travelling in the reverse direction follow the + reverse path through the IP network. + + UPSTREAM LINK: The specific bottleneck link that normally has much + less capability than the corresponding downstream link. Congestion + is not confined to this link alone, and may also occur at any point + along the forward and reverse directions (e.g., due to sharing with + other traffic flows). + + DOWNSTREAM LINK: A link on the forward path, corresponding to the + upstream link. + + ACK: A cumulative TCP acknowledgment [RFC791]. In this document, + this term refers to a TCP segment that carries a cumulative + acknowledgement (ACK), but no data. + + DELAYED ACK FACTOR, d: The number of TCP data segments acknowledged + by a TCP ACK. 
The minimum value of d is 1, since at most one ACK + should be sent for each data packet [RFC1122, RFC2581]. + + STRETCH ACK: Stretch ACKs are acknowledgements that cover more than 2 + segments of previously unacknowledged data (d>2) [RFC2581]. Stretch + ACKs can occur by design (although this is not standard), due to + implementation bugs [All97b, RFC2525], or due to ACK loss [RFC2760]. + + NORMALIZED BANDWIDTH RATIO, k: The ratio of the raw bandwidth + (capacity) of the forward direction to the return direction, divided + by the ratio of the packet sizes used in the two directions [LMS97]. + + SOFTSTATE: Per-flow state established in a network device that is + used by the protocol [Cla88]. The state expires after a period of + time (i.e., is not required to be explicitly deleted when a session + + + + + +Balakrishnan et. al. Best Current Practice [Page 3] + +RFC 3449 PILC - Asymmetric Links December 2002 + + + expires), and is continuously refreshed while a flow continues (i.e., + lost state may be reconstructed without needing to exchange + additional control messages). + +2. Motivation + + Asymmetric characteristics are exhibited by several network + technologies, including cable data networks, (e.g., DOCSIS cable TV + networks [DS00, DS01]), direct broadcast satellite (e.g., an IP + service using Digital Video Broadcast, DVB, [EN97] with an + interactive return channel), Very Small Aperture satellite Terminals + (VSAT), Asymmetric Digital Subscriber Line (ADSL) [ITU02, ANS01], and + several packet radio networks. These networks are increasingly being + deployed as high-speed Internet access networks, and it is therefore + highly desirable to achieve good TCP performance. However, the + asymmetry of the network paths often makes this challenging. + Examples of some networks that exhibit asymmetry are provided in the + Appendix. 
+ + Asymmetry may manifest itself as a difference in transmit and receive + capacity, an imbalance in the packet loss rate, or differences + between the transmit and receive paths [RFC3077]. For example, when + capacity is asymmetric, such that there is reduced capacity on + reverse path used by TCP ACKs, slow or infrequent ACK feedback + degrades TCP performance in the forward direction. Similarly, + asymmetry in the underlying Medium Access Control (MAC) and Physical + (PHY) protocols could make it expensive to transmit TCP ACKs + (disproportionately to their size), even when capacity is symmetric. + +2.1 Asymmetry due to Differences in Transmit and Receive Capacity + + Network paths may be asymmetric because the upstream and downstream + links operate at different rates and/or are implemented using + different technologies. + + The asymmetry in capacity may be substantially increased when best + effort IP flows carrying TCP ACKs share the available upstream + capacity with other traffic flows, e.g., telephony, especially flows + that have reserved upstream capacity. This includes service + guarantees at the IP layer (e.g., the Guaranteed Service [RFC2212]) + or at the subnet layer (e.g., support of Voice over IP [ITU01] using + the Unsolicited Grant service in DOCSIS [DS01], or CBR virtual + connections in ATM over ADSL [ITU02, ANS01]). + + When multiple upstream links exist the asymmetry may be reduced by + dividing upstream traffic between a number of available upstream + links. + + + + +Balakrishnan et. al. Best Current Practice [Page 4] + +RFC 3449 PILC - Asymmetric Links December 2002 + + +2.2 Asymmetry due to Shared Media in the Reverse Direction + + In networks employing centralized multiple access control, asymmetry + may be a fundamental consequence of the hub-and-spokes architecture + of the network (i.e., a single base node communicating with multiple + downstream nodes). 
The central node often incurs less transmission + overhead and does not incur latency in scheduling its own downstream + transmissions. In contrast, upstream transmission is subject to + additional overhead and latency (e.g., due to guard times between + transmission bursts, and contention intervals). This can produce + significant network path asymmetry. + + Upstream capacity may be further limited by the requirement that each + node must first request per-packet bandwidth using a contention MAC + protocol (e.g., DOCSIS 1.0 MAC restricts each node to sending at most + a single packet in each upstream time-division interval [DS00]). A + satellite network employing dynamic Bandwidth on Demand (BoD), also + consumes MAC resources for each packet sent (e.g., [EN00]). In these + schemes, the available uplink capacity is a function of the MAC + algorithm. The MAC and PHY schemes also introduce overhead per + upstream transmission which could be so significant that transmitting + short packets (including TCP ACKs) becomes as costly as transmitting + MTU-sized data packets. + +2.3 The General Problem + + Despite the technological differences between capacity-dependent and + MAC-dependent asymmetries, both kinds of network path suffer reduced + TCP performance for the same fundamental reason: the imperfection and + variability of ACK feedback. This document discusses the problem in + detail and describes several techniques that may reduce or eliminate + the constraints. + +3. How does Asymmetry Degrade TCP Performance? + + This section describes the implications of network path asymmetry on + TCP performance. The reader is referred to [BPK99, Bal98, Pad98, + FSS01, Sam99] for more details and experimental results. + +3.1 Asymmetric Capacity + + The problems that degrade unidirectional transfer performance when + the forward and return paths have very different capacities depend on + the characteristics of the upstream link. 
Two types of situations + arise for unidirectional traffic over such network paths: when the + upstream bottleneck link has sufficient queuing to prevent packet + (ACK) losses, and when the upstream bottleneck link has a small + buffer. Each is considered in turn. + + + +Balakrishnan et. al. Best Current Practice [Page 5] + +RFC 3449 PILC - Asymmetric Links December 2002 + + + If the upstream bottleneck link has deep queues, so that this does + not drop ACKs in the reverse direction, then performance is a strong + function of the normalized bandwidth ratio, k. For example, for a 10 + Mbps downstream link and a 50 Kbps upstream link, the raw capacity + ratio is 200. With 1000-byte data packets and 40-byte ACKs, the + ratio of the packet sizes is 25. This implies that k is 200/25 = 8. + Thus, if the receiver acknowledges more frequently than one ACK every + 8 (k) data packets, the upstream link will become saturated before + the downstream link, limiting the throughput in the forward + direction. Note that, the achieved TCP throughput is determined by + the minimum of the receiver advertised window or TCP congestion + window, cwnd [RFC2581]. + + If ACKs are not dropped (at the upstream bottleneck link) and k > 1 + or k > 0.5 when delayed ACKs are used [RFC1122], TCP ACK-clocking + breaks down. Consider two data packets transmitted by the sender in + quick succession. En route to the receiver, these packets get spaced + apart according to the capacity of the smallest bottleneck link in + the forward direction. The principle of ACK clocking is that the + ACKs generated in response to receiving these data packets reflects + this temporal spacing all the way back to the sender, enabling it to + transmit new data packets that maintain the same spacing [Jac88]. ACK + clocking with delayed ACKs, reflects the spacing between data packets + that actually trigger ACKs. 
However, the limited upstream capacity + and queuing at the upstream bottleneck router alters the inter-ACK + spacing of the reverse path, and hence that observed at the sender. + When ACKs arrive at the upstream bottleneck link at a faster rate + than the link can support, they get queued behind one another. The + spacing between them when they emerge from the link is dilated with + respect to their original spacing, and is a function of the upstream + bottleneck capacity. Thus the TCP sender clocks out new data packets + at a slower rate than if there had been no queuing of ACKs. The + performance of the connection is no longer dependent on the + downstream bottleneck link alone; instead, it is throttled by the + rate of arriving ACKs. As a side effect, the sender's rate of cwnd + growth also slows down. + + A second side effect arises when the upstream bottleneck link on the + reverse path is saturated. The saturated link causes persistent + queuing of packets, leading to an increasing path Round Trip Time + (RTT) [RFC2988] observed by all end hosts using the bottleneck link. + This can impact the protocol control loops, and may also trigger + false time out (underestimation of the path RTT by the sending host). + + A different situation arises when the upstream bottleneck link has a + relatively small amount of buffer space to accommodate ACKs. As the + transmission window grows, this queue fills, and ACKs are dropped. If + the receiver were to acknowledge every packet, only one of every k + + + +Balakrishnan et. al. Best Current Practice [Page 6] + +RFC 3449 PILC - Asymmetric Links December 2002 + + + ACKs would get through to the sender, and the remaining (k-1) are + dropped due to buffer overflow at the upstream link buffer (here k is + the normalized bandwidth ratio as before). In this case, the reverse + bottleneck link capacity and slow ACK arrival rate are not directly + responsible for any degraded performance.
However, the infrequency + of ACKs leads to three reasons for degraded performance: + + 1. The sender transmits data in large bursts of packets, limited only + by the available cwnd. If the sender receives only one ACK in k, + it transmits data in bursts of k (or more) packets because each + ACK shifts the sliding window by at least k (acknowledged) data + packets (TCP data segments). This increases the likelihood of + data packet loss along the forward path especially when k is + large, because routers do not handle large bursts of packets well. + + 2. Current TCP sender implementations increase their cwnd by counting + the number of ACKs they receive and not by how much data is + actually acknowledged by each ACK. The latter approach, also known + as byte counting (section 4.7), is a standard implementation + option for cwnd increase during the congestion avoidance period + [RFC2581]. Thus fewer ACKs imply a slower rate of growth of the + cwnd, which degrades performance over long-delay connections. + + 3. The sender TCP's Fast Retransmission and Fast Recovery algorithms + [RFC2581] are less effective when ACKs are lost. The sender may + possibly not receive the threshold number of duplicate ACKs even + if the receiver transmits more than the DupACK threshold (> 3 + DupACKs) [RFC2581]. Furthermore, the sender may possibly not + receive enough duplicate ACKs to adequately inflate its cwnd + during Fast Recovery. + +3.2 MAC Protocol Interactions + + The interaction of TCP with MAC protocols may degrade end-to-end + performance. Variable round-trip delays and ACK queuing are the main + symptoms of this problem. + + One example is the impact on terrestrial wireless networks [Bal98]. A + high per-packet overhead may arise from the need for communicating + link nodes to first synchronise (e.g., via a Ready To Send / Clear to + Send (RTS/CTS) protocol) before communication and the significant + turn-around time for the wireless channel.
This overhead is + variable, since the RTS/CTS exchange may need to back-off + exponentially when the remote node is busy (e.g., engaged in a + conversation with a different node). This leads to large and + variable communication latencies in packet-radio networks. + + + + + +Balakrishnan et. al. Best Current Practice [Page 7] + +RFC 3449 PILC - Asymmetric Links December 2002 + + + An asymmetric workload (more downstream than upstream traffic) may + cause ACKs to be queued in some wireless nodes (especially in the end + host modems), exacerbating the variable latency. Queuing may also + occur in other shared media, e.g., cable modem uplinks, BoD access + systems often employed on shared satellite channels. + + Variable latency and ACK queuing reduces the smoothness of the TCP + data flow. In particular, ACK traffic can interfere with the flow of + data packets, increasing the traffic load of the system. + + TCP measures the path RTT, and from this calculates a smoothed RTT + estimate (srtt) and a linear deviation, rttvar. These are used to + estimate a path retransmission timeout (RTO) [RFC2988], set to srtt + + 4*rttvar. For most wired TCP connections, the srtt remains constant + or has a low linear deviation. The RTO therefore tracks the path + RTT, and the TCP sender will respond promptly when multiple losses + occur in a window. In contrast, some wireless networks exhibit a + high variability in RTT, causing the RTO to significantly increase + (e.g., on the order of 10 seconds). Paths traversing multiple + wireless hops are especially vulnerable to this effect, because this + increases the probability that the intermediate nodes may already be + engaged in conversation with other nodes. The overhead in most MAC + schemes is a function of both the number and size of packets. + However, the MAC contention problem is a significant function of the + number of packets (e.g., ACKs) transmitted rather than their size. 
+ In other words, there is a significant cost to transmitting a packet + regardless of packet size. + + Experiments conducted on the Ricochet packet radio network in 1996 + and 1997 demonstrated the impact of radio turnarounds and the + corresponding increased RTT variability, resulting in degraded TCP + performance. It was not uncommon for TCP connections to experience + timeouts of 9 - 12 seconds, with the result that many connections + were idle for a significant fraction of their lifetime (e.g., + sometimes 35% of the total transfer time). This leads to under- + utilization of the available capacity. These effects may also occur + in other wireless subnetworks. + +3.3 Bidirectional Traffic + + Bidirectional traffic arises when there are simultaneous TCP + transfers in the forward and reverse directions over an asymmetric + network path, e.g., a user who sends an e-mail message in the reverse + direction while simultaneously receiving a web page in the forward + direction. To simplify the discussion, only one TCP connection in + each direction is considered. In many practical cases, several + simultaneous connections need to share the available capacity, + increasing the level of congestion. + + + +Balakrishnan et. al. Best Current Practice [Page 8] + +RFC 3449 PILC - Asymmetric Links December 2002 + + + Bidirectional traffic makes the effects discussed in section 3.1 more + pronounced, because part of the upstream link bandwidth is consumed + by the reverse transfer. This effectively increases the degree of + bandwidth asymmetry. Other effects also arise due to the interaction + between data packets of the reverse transfer and ACKs of the forward + transfer. Suppose at the time the forward TCP connection is + initiated, the reverse TCP connection has already saturated the + bottleneck upstream link with data packets. 
There is then a high + probability that many ACKs of the new forward TCP connection will + encounter a full upstream link buffer and hence get dropped. Even + after these initial problems, ACKs of the forward connection could + get queued behind large data packets of the reverse connection. The + larger data packets may have correspondingly long transmission times + (e.g., it takes about 280 ms to transmit a 1 Kbyte data packet over a + 28.8 kbps line). This causes the forward transfer to stall for long + periods of time. It is only at times when the reverse connection + loses packets (due to a buffer overflow at an intermediate router) + and slows down, that the forward connection gets the opportunity to + make rapid progress and build up its cwnd. + + When ACKs are queued behind other traffic for appreciable periods of + time, the burst nature of TCP traffic and self-synchronizing effects + can result in an effect known as ACK Compression [ZSC91], which + reduces the throughput of TCP. It occurs when a series of ACKs, in + one direction are queued behind a burst of other packets (e.g., data + packets traveling in the same direction) and become compressed in + time. This results in an intense burst of data packets in the other + direction, in response to the burst of compressed ACKs arriving at + the server. This phenomenon has been investigated in detail for + bidirectional traffic, and recent analytical work [LMS97] has + predicted ACK Compression may also result from bi-directional + transmission with asymmetry, and was observed in practical asymmetric + satellite subnetworks [FSS01]. In the case of extreme asymmetry + (k>>1), the inter-ACK spacing can increase due to queuing (section + 3.1), resulting in ACK dilation. + + In summary, sharing of the upstream bottleneck link by multiple flows + (e.g., IP flows to the same end host, or flows to a number of end + hosts sharing a common upstream link) increases the level of ACK + Congestion. 
The presence of bidirectional traffic exacerbates the + constraints introduced by bandwidth asymmetry because of the adverse + interaction between (large) data packets of a reverse direction + connection and the ACKs of a forward direction connection. + + + + + + + + +Balakrishnan et. al. Best Current Practice [Page 9] + +RFC 3449 PILC - Asymmetric Links December 2002 + + +3.4 Loss in Asymmetric Network Paths + + Loss may occur in either the forward or reverse direction. For data + transfer in the forward direction this results respectively in loss + of data packets and ACK packets. Loss of ACKs is less significant + than loss of data packets, because it generally results in stretch + ACKs [CR98, FSS01]. + + In the case of long delay paths, a slow upstream link [RFC3150] can + lead to another complication when the end host uses TCP large windows + [RFC1323] to maximize throughput in the forward direction. Loss of + data packets on the forward path, due to congestion, or link loss, + common for some wireless links, will generate a large number of + back-to-back duplicate ACKs (or TCP SACK packets [RFC2018]), for each + correctly received data packet following a loss. The TCP sender + employs Fast Retransmission and Recovery [RFC2581] to recover from + the loss, but even if this is successful, the ACK to the + retransmitted data segment may be significantly delayed by other + duplicate ACKs still queued at the upstream link buffer. This can + ultimately lead to a timeout [RFC2988] and a premature end to the TCP + Slow Start [RFC2581]. This results in poor forward path throughput. + Section 5.3 describes some mitigations to counter this. + +4. Improving TCP Performance using Host Mitigations + + There are two key issues that need to be addressed to improve TCP + performance over asymmetric networks. The first is to manage the + capacity of the upstream bottleneck link, used by ACKs and possibly + other traffic. 
A number of techniques exist which work by reducing + the number of ACKs that flow in the reverse direction. This has the + side effect of potentially destroying the desirable self-clocking + property of the TCP sender where transmission of new data packets is + triggered by incoming ACKs. Thus, the second issue is to avoid any + adverse impact of infrequent ACKs. + + Each of these issues can be handled by local link-layer solutions + and/or by end-to-end techniques. This section discusses end-to-end + modifications. Some techniques require TCP receiver changes + (sections 4.1, 4.4, 4.5), some require TCP sender changes (sections + 4.6, 4.7), and a pair requires changes to both the TCP sender and + receiver (sections 4.2, 4.3). One technique requires a sender + modification at the receiving host (section 4.8). The techniques may + be used independently, however some sets of techniques are + complementary, e.g., pacing (section 4.6) and byte counting (section + 4.7) which have been bundled into a single TCP Sender Adaptation + scheme [BPK99]. + + + + + +Balakrishnan et. al. Best Current Practice [Page 10] + +RFC 3449 PILC - Asymmetric Links December 2002 + + + It is normally envisaged that these changes would occur in the end + hosts using the asymmetric path, however they could, and have, been + used in a middle-box or Protocol Enhancing Proxy (PEP) [RFC3135] + employing split TCP. This document does not discuss the issues + concerning PEPs. Section 5 describes several techniques, which do + not require end-to-end changes. + +4.1 Modified Delayed ACKs + + There are two standard methods that can be used by TCP receivers to + generate acknowledgments. The method outlined in [RFC793] generates + an ACK for each incoming data segment (i.e., d=1). [RFC1122] states + that hosts should use "delayed acknowledgments".
Using this + algorithm, an ACK is generated for at least every second full-sized + segment (d=2), or if a second full-sized segment does not arrive + within a given timeout (which must not exceed 500 ms [RFC1122], and + is typically less than 200 ms). Relaxing the latter constraint + (i.e., allowing d>2) may generate Stretch ACKs [RFC2760]. This + provides a possible mitigation, which reduces the rate at which ACKs + are returned by the receiver. An implementer should only deviate + from this requirement after careful consideration of the implications + [RFC2581]. + + Reducing the number of ACKs per received data segment has a number of + undesirable effects including: + + (i) Increased path RTT + (ii) Increased time for TCP to open the cwnd + (iii) Increased TCP sender burst size, since cwnd opens in larger + steps + + In addition, a TCP receiver is often unable to determine an optimum + setting for a large d, since it will normally be unaware of the + details of the properties of the links that form the path in the + reverse direction. + + RECOMMENDATION: A TCP receiver must use the standard TCP algorithm + for sending ACKs as specified in [RFC2581]. That is, it may delay + sending an ACK after it receives a data segment [RFC1122]. When ACKs + are delayed, the receiver must generate an ACK within 500 ms and the + ACK should be generated for at least every second full sized segment + (MSS) of received data [RFC2581]. This will result in an ACK delay + factor (d) that does not exceed a value of 2. Changing the algorithm + would require a host modification to the TCP receiver and awareness + by the receiving host that it is using a connection with an + asymmetric path. Such a change has many drawbacks in the general + case and is currently not recommended for use within the Internet. + + + + +Balakrishnan et. al. 
Best Current Practice [Page 11] + +RFC 3449 PILC - Asymmetric Links December 2002 + + +4.2 Use of Large MSS + + A TCP sender that uses a large Maximum Segment Size (MSS) reduces the + number of ACKs generated per transmitted byte of data. + + Although individual subnetworks may support a large MTU, the majority + of current Internet links employ an MTU of approx 1500 bytes (that of + Ethernet). By setting the Don't Fragment (DF) bit in the IP header, + Path MTU (PMTU) discovery [RFC1191] may be used to determine the + maximum packet size (and hence MSS) a sender can use on a given + network path without being subjected to IP fragmentation, and + provides a way to automatically select a suitable MSS for a specific + path. This also guarantees that routers will not perform IP + fragmentation of normal data packets. + + By electing not to use PMTU Discovery, an end host may choose to use + IP fragmentation by routers along the path in the forward direction + [RFC793]. This allows an MSS larger than smallest MTU along the + path. However, this increases the unit of error recovery (TCP + segment) above the unit of transmission (IP packet). This is not + recommended, since it can increase the number of retransmitted + packets following loss of a single IP packet, leading to reduced + efficiency, and potentially aggravating network congestion [Ken87]. + Choosing an MSS larger than the forward path minimum MTU also permits + the sender to transmit more initial packets (a burst of IP fragments + for each TCP segment) when a session starts or following RTO expiry, + increasing the aggressiveness of the sender compared to standard TCP + [RFC2581]. This can adversely impact other standard TCP sessions + that share a network path. + + RECOMMENDATION: + + A larger forward path MTU is desirable for paths with bandwidth + asymmetry. Network providers may use a large MTU on links in the + forward direction. 
TCP end hosts using Path MTU discovery may be + able to take advantage of a large MTU by automatically selecting an + appropriate larger MSS, without requiring modification. The use of + Path MTU discovery [RFC1191] is therefore recommended. + + Increasing the unit of error recovery and congestion control (MSS) + above the unit of transmission and congestion loss (the IP packet) by + using a larger end host MSS and IP fragmentation in routers is not + recommended. + + + + + + + + +Balakrishnan et. al. Best Current Practice [Page 12] + +RFC 3449 PILC - Asymmetric Links December 2002 + + +4.3 ACK Congestion Control + + ACK Congestion Control (ACC) is an experimental technique that + operates end to end. ACC extends congestion control to ACKs, since + they may make non-negligible demands on resources (e.g., packet + buffers, and MAC transmission overhead) at an upstream bottleneck + link. It has two parts: (a) a network mechanism indicating to the + receiver that the ACK path is congested, and (b) the receiver's + response to such an indication. + + A router feeding an upstream bottleneck link may detect incipient + congestion, e.g., using an algorithm based on RED (Random Early + Detection) [FJ93]. This may track the average queue size over a time + window in the recent past. If the average exceeds a threshold, the + router may select a packet at random. If the packet IP header has + the Explicit Congestion Notification Capable Transport (ECT) bit set, + the router may mark the packet, i.e., sets an Explicit Congestion + Notification (ECN) [RFC3168] bit(s) in the IP header, otherwise the + packet is normally dropped. The ECN notification received by the end + host is reflected back to the sending TCP end host, to trigger + congestion avoidance [RFC3168]. Note that routers implementing RED + with ECN, do not eliminate packet loss, and may drop a packet (even + when the ECT bit is set). 
It is also possible to use an algorithm + other than RED to decide when to set the ECN bit. + + ACC extends ECN so that both TCP data packets and ACKs set the ECT + bit and are thus candidates for being marked with an ECN bit. + Therefore, upon receiving an ACK with the ECN bit set [RFC3168], a + TCP receiver reduces the rate at which it sends ACKs. It maintains a + dynamically varying delayed-ACK factor, d, and sends one ACK for + every d data packets received. When it receives a packet with the + ECN bit set, it increases d multiplicatively, thereby + multiplicatively decreasing the frequency of ACKs. For each + subsequent RTT (e.g., determined using the TCP RTTM option [RFC1323]) + during which it does not receive an ECN, it linearly decreases the + factor d, increasing the frequency of ACKs. Thus, the receiver + mimics the standard congestion control behavior of TCP senders in the + manner in which it sends ACKs. + + The maximum value of d is determined by the TCP sender window size, + which could be conveyed to the receiver in a new (experimental) TCP + option. The receiver should send at least one ACK (preferably more) + for each window of data from the sender (i.e., d < (cwnd/mss)) to + prevent the sender from stalling until the receiver's delayed ACK + timer triggers an ACK to be sent. + + + + + + +Balakrishnan et. al. Best Current Practice [Page 13] + +RFC 3449 PILC - Asymmetric Links December 2002 + + + RECOMMENDATION: ACK Congestion Control (ACC) is an experimental + technique that requires TCP sender and receiver modifications. There + is currently little experience of using such techniques in the + Internet. Future versions of TCP may evolve to include this or + similar techniques. These are the subject of ongoing research. ACC + is not recommended for use within the Internet in its current form. 
+ +4.4 Window Prediction Mechanism + + The Window Prediction Mechanism (WPM) is a TCP receiver side + mechanism [CLP98] that uses a dynamic ACK delay factor (varying d) + resembling the ACC scheme (section 4.3). The TCP receiver + reconstructs the congestion control behavior of the TCP sender by + predicting a cwnd value. This value is used along with the allowed + window to adjust the receiver's value of d. WPM accommodates for + unnecessary retransmissions resulting from losses due to link errors. + + RECOMMENDATION: Window Prediction Mechanism (WPM) is an experimental + TCP receiver side modification. There is currently little experience + of using such techniques in the Internet. Future versions of TCP may + evolve to include this or similar techniques. These are the subjects + of ongoing research. WPM is not recommended for use within the + Internet in its current form. + +4.5 Acknowledgement based on Cwnd Estimation. + + Acknowledgement based on Cwnd Estimation (ACE) [MJW00] attempts to + measure the cwnd at the TCP receiver and maintain a varying ACK delay + factor (d). The cwnd is estimated by counting the number of packets + received during a path RTT. The technique may improve accuracy of + prediction of a suitable cwnd. + + RECOMMENDATION: Acknowledgement based on Cwnd Estimation (ACE) is an + experimental TCP receiver side modification. There is currently + little experience of using such techniques in the Internet. Future + versions of TCP may evolve to include this or similar techniques. + These are the subject of ongoing research. ACE is not recommended + for use within the Internet in its current form. + +4.6 TCP Sender Pacing + + Reducing the frequency of ACKs may alleviate congestion of the + upstream bottleneck link, but can lead to increased size of TCP + sender bursts (section 4.1). 
This may slow the growth of cwnd, and + is undesirable when used over shared network paths since it may + significantly increase the maximum number of packets in the + bottleneck link buffer, potentially resulting in an increase in + network congestion. This may also lead to ACK Compression [ZSC91]. + + + +Balakrishnan et. al. Best Current Practice [Page 14] + +RFC 3449 PILC - Asymmetric Links December 2002 + + + TCP Pacing [AST00], generally referred to as TCP Sender pacing, + employs an adapted TCP sender to alleviate transmission burstiness. + A bound is placed on the maximum number of packets the TCP sender can + transmit back-to-back (at local line rate), even if the window(s) + allow the transmission of more data. If necessary, more bursts of + data packets are scheduled for later points in time computed based on + the transmission rate of the TCP connection. The transmission rate + may be estimated from the ratio cwnd/srtt. Thus, large bursts of + data packets get broken up into smaller bursts spread over time. + + A subnetwork may also provide pacing (e.g., Generic Traffic Shaping + (GTS)), but implies a significant increase in the per-packet + processing overhead and buffer requirement at the router where + shaping is performed (section 5.3.3). + + RECOMMENDATIONS: TCP Sender Pacing requires a change to + implementation of the TCP sender. It may be beneficial in the + Internet and will significantly reduce the burst size of packets + transmitted by a host. This successfully mitigates the impact of + receiving Stretch ACKs. TCP Sender Pacing implies increased + processing cost per packet, and requires a prediction algorithm to + suggest a suitable transmission rate. There are hence performance + trade-offs between end host cost and network performance. + Specification of efficient algorithms remains an area of ongoing + research. Use of TCP Sender Pacing is not expected to introduce new + problems.
It is an experimental mitigation for TCP hosts that may + control the burstiness of transmission (e.g., resulting from Type 1 + techniques, section 5.1.2), however it is not currently widely + deployed. It is not recommended for use within the Internet in its + current form. + +4.7 TCP Byte Counting + + The TCP sender can avoid slowing growth of cwnd by taking into + account the volume of data acknowledged by each ACK, rather than + opening the cwnd based on the number of received ACKs. So, if an ACK + acknowledges d data packets (or TCP data segments), the cwnd would + grow as if d separate ACKs had been received. This is called TCP + Byte Counting [RFC2581, RFC2760]. (One could treat the single ACK as + being equivalent to d/2, instead of d ACKs, to mimic the effect of + the TCP delayed ACK algorithm.) This policy works because cwnd + growth is only tied to the available capacity in the forward + direction, so the number of ACKs is immaterial. + + This may mitigate the impact of asymmetry when used in combination + with other techniques (e.g., a combination of TCP Pacing + (section 4.6), and ACC (section 4.3) associated with a duplicate ACK + threshold at the receiver.) + + + +Balakrishnan et. al. Best Current Practice [Page 15] + +RFC 3449 PILC - Asymmetric Links December 2002 + + + The main issue is that TCP byte counting may generate undesirable + long bursts of TCP packets at the sender host line rate. An + implementation must also consider that data packets in the forward + direction and ACKs in the reverse direction may both travel over + network paths that perform some amount of packet reordering. + Reordering of IP packets is currently common, and may arise from + various causes [BPS00]. + + RECOMMENDATION: TCP Byte Counting requires a small TCP sender + modification. In its simplest form, it can generate large bursts of + TCP data packets, particularly when Stretch ACKs are received. 
+ Unlimited byte counting is therefore not allowed [RFC2581] for use + within the Internet. + + It is therefore strongly recommended [RFC2581, RFC2760] that any byte + counting scheme should include a method to mitigate the potentially + large bursts of TCP data packets the algorithm can cause (e.g., TCP + Sender Pacing (section 4.6), ABC [abc-ID]). If the burst size or + sending rate of the TCP sender can be controlled then the scheme may + be beneficial when Stretch ACKs are received. Determining safe + algorithms remains an area of ongoing research. Further + experimentation will then be required to assess the success of these + safeguards, before they can be recommended for use in the Internet. + +4.8 Backpressure + + Backpressure is a technique to enhance the performance of + bidirectional traffic for end hosts directly connected to the + upstream bottleneck link [KVR98]. A limit is set on how many data + packets of upstream transfers can be enqueued at the upstream + bottleneck link. In other words, the bottleneck link queue exerts + 'backpressure' on the TCP (sender) layer. This requires a modified + implementation, compared to that currently deployed in many TCP + stacks. Backpressure ensures that ACKs of downstream connections do + not get starved at the upstream bottleneck, thereby improving + performance of the downstream connections. Similar generic schemes + that may be implemented in hosts/routers are discussed in section + 5.4. + + Backpressure can be unfair to a reverse direction connection and make + its throughput highly sensitive to the dynamics of the forward + connection(s). + + RECOMMENDATION: Backpressure requires an experimental modification to + the sender protocol stack of a host directly connected to an upstream + bottleneck link. Use of backpressure is an implementation issue, + rather than a network protocol issue. Where backpressure is + implemented, the optimizations described in this section could be + + + +Balakrishnan et. al. 
Best Current Practice [Page 16] + +RFC 3449 PILC - Asymmetric Links December 2002 + + + desirable and can benefit bidirectional traffic for hosts. + Specification of safe algorithms for providing backpressure is still + a subject of ongoing research. The technique is not recommended for + use within the Internet in its current form. + +5. Improving TCP performance using Transparent Modifications + + Various link and network layer techniques have been suggested to + mitigate the effect of an upstream bottleneck link. These techniques + may provide benefit without modification to either the TCP sender or + receiver, or may alternately be used in conjunction with one or more + of the schemes identified in section 4. In this document, these + techniques are known as "transparent" [RFC3135], because at the + transport layer, the TCP sender and receiver are not necessarily + aware of their existence. This does not imply that they do not + modify the pattern and timing of packets as observed at the network + layer. The techniques are classified here into three types based on + the point at which they are introduced. + + Most techniques require the individual TCP connections passing over + the bottleneck link(s) to be separately identified and imply that + some per-flow state is maintained for active TCP connections. A link + scheduler may also be employed (section 5.4). The techniques (with + one exception, ACK Decimation (section 5.2.2)) require: + + (i) Visibility of an unencrypted IP and TCP packet header (e.g., no + use of IPSec with payload encryption [RFC2406]). + (ii) Knowledge of IP/TCP options and ability to inspect packets with + tunnel encapsulations (e.g., [RFC2784]) or to suspend + processing of packets with unknown formats. + (iii) Ability to demultiplex flows (by using address/protocol/port + number, or an explicit flow-id). 
+ + [RFC3135] describes a class of network device that provides more than + forwarding of packets, and which is known as a Protocol Enhancing + Proxy (PEP). A large spectrum of PEP devices exists, ranging from + simple devices (e.g., ACK filtering) to more sophisticated devices + (e.g., stateful devices that split a TCP connection into two separate + parts). The techniques described in section 5 of this document + belong to the simpler type, and do not inspect or modify any TCP or + UDP payload data. They also do not modify port numbers or link + addresses. Many of the risks associated with more complex PEPs do + not exist for these schemes. Further information about the operation + and the risks associated with using PEPs are described in [RFC3135]. + + + + + + + +Balakrishnan et. al. Best Current Practice [Page 17] + +RFC 3449 PILC - Asymmetric Links December 2002 + + +5.1 TYPE 0: Header Compression + + A client may reduce the volume of bits used to send a single ACK by + using compression [RFC3150, RFC3135]. Most modern dial-up modems + support ITU-T V.42 bulk compression. In contrast to bulk + compression, header compression is known to be very effective at + reducing the number of bits sent on the upstream link [RFC1144]. This + relies on the observation that most TCP packet headers vary only in a + few bit positions between successive packets in a flow, and that the + variations can often be predicted. + +5.1.1 TCP Header Compression + + TCP header compression [RFC1144] (sometimes known as V-J compression) + is a Proposed Standard describing use over low capacity links running + SLIP or PPP [RFC3150]. It greatly reduces the size of ACKs on the + reverse link when losses are infrequent (a situation that ensures + that the state of the compressor and decompressor are synchronized). 
+ However, this alone does not address all of the asymmetry issues: + + (i) In some (e.g., wireless) subnetworks there is a significant + per-packet MAC overhead that is independent of packet size + (section 3.2). + (ii) A reduction in the size of ACKs does not prevent adverse + interaction with large upstream data packets in the presence + of bidirectional traffic (section 3.3). + (iii) TCP header compression cannot be used with packets that have + IP or TCP options (including IPSec [RFC2402, RFC2406], TCP + RTTM [RFC1323], TCP SACK [RFC2018], etc.). + (iv) The performance of header compression described by RFC1144 is + significantly degraded when compressed packets are lost. An + improvement, which can still incur significant penalty on + long network paths is described in [RFC2507]. This suggests + it should only be used on links (or paths) that experience a + low level of packet loss [RFC3150]. + (v) The normal implementation of Header Compression inhibits + compression when IP is used to support tunneling (e.g., L2TP, + GRE [RFC2784], IP-in-IP). The tunnel encapsulation + complicates locating the appropriate packet headers. Although + GRE allows Header Compression on the inner (tunneled) IP + header [RFC2784], this is not recommended, since loss of a + packet (e.g., due to router congestion along the tunnel path) + will result in discard of all packets for one RTT [RFC1144]. + + RECOMMENDATION: TCP Header Compression is a transparent modification + performed at both ends of the upstream bottleneck link. It offers no + benefit for flows employing IPSec [RFC2402, RFC2406], or when + additional protocol headers are present (e.g., IP or TCP options, + + + +Balakrishnan et. al. Best Current Practice [Page 18] + +RFC 3449 PILC - Asymmetric Links December 2002 + + + and/or tunnel encapsulation headers). The scheme is widely + implemented and deployed and used over Internet links. 
It is + recommended to improve TCP performance for paths that have a low-to- + medium bandwidth asymmetry (e.g., k<10). + + In the form described in [RFC1144], TCP performance is degraded when + used over links (or paths) that may exhibit appreciable rates of + packet loss [RFC3150]. It may also not provide significant + improvement for upstream links with bidirectional traffic. It is + therefore not desirable for paths that have a high bandwidth + asymmetry (e.g., k>10). + +5.1.2 Alternate Robust Header Compression Algorithms + + TCP header compression [RFC1144] and IP header compression [RFC2507] + do not perform well when subject to packet loss. Further, they do + not compress packets with TCP option fields (e.g., SACK [RFC2018] and + Timestamp (RTTM) [RFC1323]). However, recent work on more robust + schemes suggests that a new generation of compression algorithms may + be developed which are much more robust. The IETF ROHC working group + has specified compression techniques for UDP-based traffic [RFC3095] + and is examining a number of schemes that may provide improved TCP + header compression. These could be beneficial for asymmetric network + paths. + + RECOMMENDATION: Robust header compression is a transparent + modification that may be performed at both ends of an upstream + bottleneck link. This class of techniques may also be suited to + Internet paths that suffer low levels of re-ordering. The techniques + benefit paths with a low-to-medium bandwidth asymmetry (e.g., k<10) + and may be robust to packet loss. + + Selection of suitable compression algorithms remains an area of + ongoing research. It is possible that schemes may be derived which + support IPSec authentication, but not IPSec payload encryption. Such + schemes do not alone provide significant improvement in asymmetric + networks with a high asymmetry and/or bidirectional traffic. 
+ +5.2 TYPE 1: Reverse Link Bandwidth Management + + Techniques beyond Type 0 header compression are required to address + the performance problems caused by appreciable asymmetry (k>>1). One + set of techniques is implemented only at one point on the reverse + direction path, within the router/host connected to the upstream + bottleneck link. These use flow class or per-flow queues at the + upstream link interface to manage the queue of packets waiting for + transmission on the bottleneck upstream link. + + + + +Balakrishnan et. al. Best Current Practice [Page 19] + +RFC 3449 PILC - Asymmetric Links December 2002 + + + This type of technique bounds the upstream link buffer queue size, + and employs an algorithm to remove (discard) excess ACKs from each + queue. This relies on the cumulative nature of ACKs (section 4.1). + Two approaches are described which employ this type of mitigation. + +5.2.1 ACK Filtering + + ACK Filtering (AF) [DMT96, BPK99] (also known as ACK Suppression + [SF98, Sam99, FSS01]) is a TCP-aware link-layer technique that + reduces the number of ACKs sent on the upstream link. This technique + has been deployed in specific production networks (e.g., asymmetric + satellite networks [ASB96]). The challenge is to ensure that the + sender does not stall waiting for ACKs, which may happen if ACKs are + indiscriminately removed. + + When an ACK from the receiver is about to be enqueued at an upstream + bottleneck link interface, the router or the end host link layer (if + the host is directly connected to the upstream bottleneck link) + checks the transmit queue(s) for older ACKs belonging to the same TCP + connection. If ACKs are found, some (or all of them) are removed + from the queue, reducing the number of ACKs. + + Some ACKs also have other functions in TCP [RFC1144], and should not + be deleted to ensure normal operation. AF should therefore not + delete an ACK that has any data or TCP flags set (SYN, RST, URG, and + FIN). 
In addition, it should avoid deleting a series of 3 duplicate + ACKs that indicate the need for Fast Retransmission [RFC2581] or ACKs + with the Selective ACK option (SACK)[RFC2018] from the queue to avoid + causing problems to TCP's data-driven loss recovery mechanisms. + Appropriate treatment is also needed to preserve correct operation of + ECN feedback (carried in the TCP header) [RFC3168]. + + A range of policies to filter ACKs may be used. These may be either + deterministic or random (similar to a random-drop gateway, but should + take into consideration the semantics of the items in the queue). + Algorithms have also been suggested to ensure a minimum ACK rate to + guarantee the TCP sender window is updated [Sam99, FSS01], and to + limit the number of data packets (TCP segments) acknowledged by a + Stretch ACK. Per-flow state needs to be maintained only for + connections with at least one packet in the queue (similar to FRED + [LM97]). This state is soft [Cla88], and if necessary, can easily be + reconstructed from the contents of the queue. + + The undesirable effect of delayed DupACKs (section 3.4) can be + reduced by deleting duplicate ACKs above a threshold value [MJW00, + CLP98] allowing Fast Retransmission, but avoiding early TCP timeouts, + which may otherwise result from excessive queuing of DupACKs. + + + + +Balakrishnan et. al. Best Current Practice [Page 20] + +RFC 3449 PILC - Asymmetric Links December 2002 + + + Future schemes may include more advanced rules allowing removal of + selected SACKs [RFC2018]. Such a scheme could prevent the upstream + link queue from becoming filled by back-to-back ACKs with SACK + blocks. Since a SACK packet is much larger than an ACK, it would + otherwise add significantly to the path delay in the reverse + direction. Selection of suitable algorithms remains an ongoing area + of research. + + RECOMMENDATION: ACK Filtering requires a modification to the upstream + link interface. 
The scheme has been deployed in some networks where + the extra processing overhead (per ACK) may be compensated for by + avoiding the need to modify TCP. ACK Filtering can generate Stretch + ACKs resulting in large bursts of TCP data packets. Therefore on its + own, it is not recommended for use in the general Internet. + + ACK Filtering when used in combination with a scheme to mitigate the + effect of Stretch ACKs (i.e., control TCP sender burst size) is + recommended for paths with appreciable asymmetry (k>1) and/or with + bidirectional traffic. Suitable algorithms to support IPSec + authentication, SACK, and ECN remain areas of ongoing research. + +5.2.2 ACK Decimation + + ACK Decimation is based on standard router mechanisms. By using an + appropriate configuration of (small) per-flow queues and a chosen + dropping policy (e.g., Weighted Fair Queuing, WFQ) at the upstream + bottleneck link, a similar effect to AF (section 5.2.1) may be + obtained, but with less control of the actual packets which are + dropped. + + In this scheme, the router/host at the bottleneck upstream link + maintains per-flow queues and services them fairly (or with + priorities) by queuing and scheduling of ACKs and data packets in the + reverse direction. A small queue threshold is maintained to drop + excessive ACKs from the tail of each queue, in order to reduce ACK + Congestion. The inability to identify special ACK packets (c.f., AF) + introduces some major drawbacks to this approach, such as the + possibility of losing DupACKs, FIN/ACK, RST packets, or packets + carrying ECN information [RFC3168]. Loss of these packets does not + significantly impact network congestion, but does adversely impact + the performance of the TCP session observing the loss. + + A WFQ scheduler may assign a higher priority to interactive traffic + (providing it has a mechanism to identify such traffic) and provide a + fair share of the remaining capacity to the bulk traffic. 
In the + presence of bidirectional traffic, and with a suitable scheduling + policy, this may ensure fairer sharing for ACK and data packets. An + increased forward transmission rate is achieved over asymmetric links + + + +Balakrishnan et. al. Best Current Practice [Page 21] + +RFC 3449 PILC - Asymmetric Links December 2002 + + + by an increased ACK Decimation rate, leading to generation of Stretch + ACKs. As in AF, TCP sender burst size increases when Stretch ACKs + are received unless other techniques are used in combination with + this technique. + + This technique has been deployed in specific networks (e.g., a + network with high bandwidth asymmetry supporting high-speed data + services to in-transit mobile hosts [Seg00]). Although not optimal, + it offered a potential mitigation applicable when the TCP header is + difficult to identify or not visible to the link layer (e.g., due to + IPSec encryption). + + RECOMMENDATION: ACK Decimation uses standard router mechanisms at the + upstream link interface to constrain the rate at which ACKs are fed + to the upstream link. The technique is beneficial with paths having + appreciable asymmetry (k>1). It is however suboptimal, in that it + may lead to inefficient TCP error recovery (and hence in some cases + degraded TCP performance), and provides only crude control of link + behavior. It is therefore recommended that where possible, ACK + Filtering should be used in preference to ACK Decimation. + + When ACK Decimation is used on paths with an appreciable asymmetry + (k>1) (or with bidirectional traffic) it increases the burst size of + the TCP sender, use of a scheme to mitigate the effect of Stretch + ACKs or control burstiness is therefore strongly recommended. + +5.3 TYPE 2: Handling Infrequent ACKs + + TYPE 2 mitigations perform TYPE 1 upstream link bandwidth management, + but also employ a second active element which mitigates the effect of + the reduced ACK rate and burstiness of ACK transmission. 
This is + desirable when end hosts use standard TCP sender implementations + (e.g., those not implementing the techniques in sections 4.6, 4.7). + + Consider a path where a TYPE 1 scheme forwards a Stretch ACK covering + d TCP packets (i.e., where the acknowledgement number is d*MSS larger + than the last ACK received by the TCP sender). When the TCP sender + receives this ACK, it can send a burst of d (or d+1) TCP data + packets. The sender is also constrained by the current cwnd. + Received ACKs also serve to increase cwnd (by at most one MSS). + + A TYPE 2 scheme mitigates the impact of the reduced ACK frequency + resulting when a TYPE 1 scheme is used. This is achieved by + interspersing additional ACKs before each received Stretch ACK. The + additional ACKs, together with the original ACK, provide the TCP + sender with sufficient ACKs to allow the TCP cwnd to open in the same + way as if each of the original ACKs sent by the TCP receiver had been + forwarded by the reverse path. In addition, by attempting to restore + + + +Balakrishnan et. al. Best Current Practice [Page 22] + +RFC 3449 PILC - Asymmetric Links December 2002 + + + the spacing between ACKs, such a scheme can also restore the TCP + self-clocking behavior, and reduce the TCP sender burst size. Such + schemes need to ensure conservative behavior (i.e., should not + introduce more ACKs than were originally sent) and reduce the + probability of ACK Compression [ZSC91]. + + The action is performed at two points on the return path: the + upstream link interface (where excess ACKs are removed), and a point + further along the reverse path (after the bottleneck upstream + link(s)), where replacement ACKs are inserted. This attempts to + reconstruct the ACK stream sent by the TCP receiver when used in + combination with AF (section 5.2.1), or ACK Decimation (section + 5.2.2). 
+ + TYPE 2 mitigations may be performed locally at the receive interface + directly following the upstream bottleneck link, or may alternatively + be applied at any point further along the reverse path (this is not + necessarily on the forward path, since asymmetric routing may employ + different forward and reverse internet paths). Since the techniques + may generate multiple ACKs upon reception of each individual Stretch + ACK, it is strongly recommended that the expander implements a scheme + to prevent exploitation as a "packet amplifier" in a Denial-of- + Service (DoS) attack (e.g., to verify the originator of the ACK). + Identification of the sender could be accomplished by appropriately + configured packet filters and/or by tunnel authentication procedures + (e.g., [RFC2402, RFC2406]). A limit on the number of reconstructed + ACKs that may be generated from a single packet may also be + desirable. + +5.3.1 ACK Reconstruction + + ACK Reconstruction (AR) [BPK99] is used in conjunction with AF + (section 5.2.1). AR deploys a soft-state [Cla88] agent called an ACK + Reconstructor on the reverse path following the upstream bottleneck + link. The soft-state can be regenerated if lost, based on received + ACKs. When a Stretch ACK is received, AR introduces additional ACKs + by filling gaps in the ACK sequence. Some potential Denial-of- + Service vulnerabilities may arise (section 6) and need to be + addressed by appropriate security techniques. + + The Reconstructor determines the number of additional ACKs, by + estimating the number of filtered ACKs. This uses implicit + information present in the received ACK stream by observing the ACK + sequence number of each received ACK. An example implementation + could set an ACK threshold, ackthresh, to twice the MSS (this assumes + the chosen MSS is known by the link). The factor of two corresponds + + + + + +Balakrishnan et. al. 
Best Current Practice [Page 23] + +RFC 3449 PILC - Asymmetric Links December 2002 + + + to standard TCP delayed-ACK policy (d=2). Thus, if successive ACKs + arrive separated by delta, the Reconstructor regenerates a maximum of + ((delta/ackthresh) - 2) ACKs. + + To reduce the TCP sender burst size and allow the cwnd to increase at + a rate governed by the downstream link, the reconstructed ACKs must + be sent at a consistent rate (i.e., temporal spacing between + reconstructed ACKs). One method is for the Reconstructor to measure + the arrival rate of ACKs using an exponentially weighted moving + average estimator. This rate depends on the output rate from the + upstream link and on the presence of other traffic sharing the link. + The output of the estimator indicates the average temporal spacing + for the ACKs (and the average rate at which ACKs would reach the TCP + sender if there were no further losses or delays). This may be used + by the Reconstructor to set the temporal spacing of reconstructed + ACKs. The scheme may also be used in combination with TCP sender + adaptation (e.g., a combination of the techniques in sections 4.6 and + 4.7). + + The trade-off in AR is between obtaining less TCP sender burstiness, + and a better rate of cwnd increase, with a reduction in RTT + variation, versus a modest increase in the path RTT. The technique + cannot perform reconstruction on connections using IPSec (AH + [RFC2402] or ESP [RFC2406]), since it is unable to generate + appropriate security information. It also cannot regenerate other + packet header information (e.g., the exact pattern of bits carried in + the IP packet ECN field [RFC3168] or the TCP RTTM option [RFC1323]). 
+ + An ACK Reconstructor operates correctly (i.e., generates no spurious + ACKs and preserves the end-to-end semantics of TCP), providing: + + (i) the TCP receiver uses ACK Delay (d=2) [RFC2581] + (ii) the Reconstructor receives only in-order ACKs + (iii) all ACKs are routed via the Reconstructor + (iv) the Reconstructor correctly determines the TCP MSS used by + the session + (v) the packets do not carry additional header information (e.g., + TCP RTTM option [RFC1323], IPSec using AH [RFC2402] or ESP + [RFC2406]). + + RECOMMENDATION: ACK Reconstruction is an experimental transparent + modification performed on the reverse path following the upstream + bottleneck link. It is designed to be used in conjunction with a + TYPE 1 mitigation. It reduces the burst size of TCP transmission in + the forward direction, which may otherwise increase when TYPE 1 + schemes are used alone. It requires modification of equipment after + the upstream link (including maintaining per-flow soft state). The + scheme introduces implicit assumptions about the network path and has + + + +Balakrishnan et. al. Best Current Practice [Page 24] + +RFC 3449 PILC - Asymmetric Links December 2002 + + + potential Denial-of-Service vulnerabilities (i.e., acting as a packet + amplifier); these need to be better understood and addressed by + appropriate security techniques. + + Selection of appropriate algorithms to pace the ACK traffic remains + an open research issue. There is also currently little experience of + the implications of using such techniques in the Internet, and + therefore it is recommended that this technique should not be used + within the Internet in its current form. + +5.3.2 ACK Compaction and Companding + + ACK Compaction and ACK Companding [Sam99, FSS01] are techniques that + operate at a point on the reverse path following the constrained ACK + bottleneck. 
Like AR (section 5.3.1), ACK Compaction and ACK + Companding are both used in conjunction with an AF technique (section + 5.2.1) and regenerate filtered ACKs, restoring the ACK stream. + However, they differ from AR in that they use a modified AF (known as + a compactor or compressor), in which explicit information is added to + all Stretch ACKs generated by the AF. This is used to explicitly + synchronize the reconstruction operation (referred to here as + expansion). + + The modified AF combines two modifications: First, when the + compressor deletes an ACK from the upstream bottleneck link queue, it + appends explicit information (a prefix) to the remaining ACK (this + ACK is marked to ensure it is not subsequently deleted). The + additional information contains details of the conditions under which + ACKs were previously filtered. A variety of information may be + encoded in the prefix. This includes the number of ACKs deleted by + the AF and the average number of bytes acknowledged. This may + subsequently be used by an expander at the remote end of the tunnel. + Further timing information may also be added to control the pacing of + the regenerated ACKs [FSS01]. The temporal spacing of the filtered + ACKs may also be encoded. + + To encode the prefix requires the subsequent expander to recognize a + modified ACK header. This would normally limit the expander to + link-local operation (at the receive interface of the upstream + bottleneck link). If remote expansion is needed further along the + reverse path, a tunnel may be used to pass the modified ACKs to the + remote expander. The tunnel introduces extra overhead, however + networks with asymmetric capacity and symmetric routing frequently + already employ such tunnels (e.g., in a UDLR network [RFC3077], the + expander may be co-located with the feed router). + + + + + + +Balakrishnan et. al. 
Best Current Practice [Page 25] + +RFC 3449 PILC - Asymmetric Links December 2002 + + + ACK expansion uses a stateless algorithm to expand the ACK (i.e., + each received packet is processed independently of previously + received packets). It uses the prefix information together with the + acknowledgment field in the received ACK, to produce an equivalent + number of ACKs to those previously deleted by the compactor. These + ACKs are forwarded to the original destination (i.e., the TCP + sender), preserving normal TCP ACK clocking. In this way, ACK + Compaction, unlike AR, is not reliant on specific ACK policies, nor + must it see all ACKs associated with the reverse path (e.g., it may + be compatible with schemes such as DAASS [RFC2760]). + + Some potential Denial-of-Service vulnerabilities may arise (section + 6) and need to be addressed by appropriate security techniques. The + technique cannot perform reconstruction on connections using IPSec, + since they are unable to regenerate appropriate security information. + It is possible to explicitly encode IPSec security information from + suppressed packets, allowing operation with IPSec AH, however this + remains an open research issue, and implies an additional overhead + per ACK. + + RECOMMENDATION: ACK Compaction and Companding are experimental + transparent modifications performed on the reverse path following the + upstream bottleneck link. They are designed to be used in + conjunction with a modified TYPE 1 mitigation and reduce the burst + size of TCP transmission in the forward direction, which may + otherwise increase when TYPE 1 schemes are used alone. + + The technique is desirable, but requires modification of equipment + after the upstream bottleneck link (including processing of a + modified ACK header). Selection of appropriate algorithms to pace + the ACK traffic also remains an open research issue. 
Some potential + Denial-of-Service vulnerabilities may arise with any device that may + act as a packet amplifier. These need to be addressed by appropriate + security techniques. There is little experience of using the scheme + over Internet paths. This scheme is a subject of ongoing research + and is not recommended for use within the Internet in its current + form. + +5.3.3 Mitigating TCP packet bursts generated by Infrequent ACKs + + The bursts of data packets generated when a Type 1 scheme is used on + the reverse direction path may be mitigated by introducing a router + supporting Generic Traffic Shaping (GTS) on the forward path [Seg00]. + GTS is a standard router mechanism implemented in many deployed + routers. This technique does not eliminate the bursts of data + generated by the TCP sender, but attempts to smooth out the bursts by + employing scheduling and queuing techniques, producing traffic which + resembles that when TCP Pacing is used (section 4.6). These + + + +Balakrishnan et. al. Best Current Practice [Page 26] + +RFC 3449 PILC - Asymmetric Links December 2002 + + + techniques require maintaining per-flow soft-state in the router, and + increase per-packet processing overhead. Some additional buffer + capacity is needed to queue packets being shaped. + + To perform GTS, the router needs to select appropriate traffic + shaping parameters, which require knowledge of the network policy, + connection behavior and/or downstream bottleneck characteristics. GTS + may also be used to enforce other network policies and promote + fairness between competing TCP connections (and also UDP and + multicast flows). It also reduces the probability of ACK Compression + [ZSC91]. + + The smoothing of packet bursts reduces the impact of the TCP + transmission bursts on routers and hosts following the point at which + GTS is performed. 
It is therefore desirable to perform GTS near to + the sending host, or at least at a point before the first forward + path bottleneck router. + + RECOMMENDATIONS: Generic Traffic Shaping (GTS) is a transparent + technique employed at a router on the forward path. The algorithms + to implement GTS are available in widely deployed routers and may be + used on an Internet link, but do imply significant additional per- + packet processing cost. + + Configuration of a GTS is a policy decision of a network service + provider. When appropriately configured the technique will reduce + the size of TCP data packet bursts, mitigating the effects of Type 1 + techniques. GTS is recommended for use in the Internet in + conjunction with Type 1 techniques such as ACK Filtering (section + 5.2.1) and ACK Decimation (section 5.2.2). + +5.4 TYPE 3: Upstream Link Scheduling + + Many of the above schemes imply using per flow queues (or per + connection queues in the case of TCP) at the upstream bottleneck + link. Per-flow queuing (e.g., FQ, CBQ) offers benefit when used on + any slow link (where the time to transmit a packet forms an + appreciable part of the path RTT) [RFC3150]. Type 3 schemes offer + additional benefit when used with one of the above techniques. + +5.4.1 Per-Flow queuing at the Upstream Bottleneck Link + + When bidirectional traffic exists in a bandwidth asymmetric network + competing ACK and packet data flows along the return path may degrade + the performance of both upstream and downstream flows [KVR98]. + Therefore, it is highly desirable to use a queuing strategy combined + with a scheduling mechanism at the upstream link. This has also been + called priority-based multiplexing [RFC3135]. + + + +Balakrishnan et. al. Best Current Practice [Page 27] + +RFC 3449 PILC - Asymmetric Links December 2002 + + + On a slow upstream link, appreciable jitter may be introduced by + sending large data packets ahead of ACKs [RFC3150].
A simple scheme + may be implemented using per-flow queuing with a fair scheduler + (e.g., round robin service to all flows, or priority scheduling). A + modified scheduler [KVR98] could place a limit on the number of ACKs + a host is allowed to transmit upstream before transmitting a data + packet (assuming at least one data packet is waiting in the upstream + link queue). This guarantees at least a certain minimum share of the + capacity to flows in the reverse direction, while enabling flows in + the forward direction to improve TCP throughput. + + Bulk (payload) compression, a small MTU, link level transparent + fragmentation [RFC1991, RFC2686] or link level suspend/resume + capability (where higher priority frames may pre-empt transmission of + lower priority frames) may be used to mitigate the impact (jitter) of + bidirectional traffic on low speed links [RFC3150]. More advanced + schemes (e.g., WFQ) may also be used to improve the performance of + transfers with multiple ACK streams such as http [Seg00]. + + RECOMMENDATION: Per-flow queuing is a transparent modification + performed at the upstream bottleneck link. Per-flow (or per-class) + scheduling does not impact the congestion behavior of the Internet, + and may be used on any Internet link. The scheme has particular + benefits for slow links. It is widely implemented and widely + deployed on links operating at less than 2 Mbps. This is recommended + as a mitigation on its own or in combination with one of the other + described techniques. + +5.4.2 ACKs-first Scheduling + + ACKs-first Scheduling is an experimental technique to improve + performance of bidirectional transfers. In this case data packets + and ACKs compete for resources at the upstream bottleneck link + [RFC3150]. A single First-In First-Out, FIFO, queue for both data + packets and ACKs could impact the performance of forward transfers. 
+ For example, if the upstream bottleneck link is a 28.8 kbps dialup + line, the transmission of a 1 Kbyte sized data packet would take + about 280 ms. So even if just two such data packets get queued ahead + of ACKs (not an uncommon occurrence since data packets are sent out + in pairs during slow start), they would shut out ACKs for well over + half a second. If more than two data packets are queued up ahead of + an ACK, the ACKs would be delayed by even more [RFC3150]. + + A possible approach to alleviating this is to schedule data and ACKs + differently from FIFO. One algorithm, in particular, is ACKs-first + scheduling, which accords a higher priority to ACKs over data + packets. The motivation for such scheduling is that it minimizes the + idle time for the forward connection by minimizing the time that ACKs + + + +Balakrishnan et. al. Best Current Practice [Page 28] + +RFC 3449 PILC - Asymmetric Links December 2002 + + + spend queued behind data packets at the upstream link. At the same + time, with Type 0 techniques such as header compression [RFC1144], + the transmission time of ACKs becomes small enough that the impact on + subsequent data packets is minimal. (Subnetworks in which the per- + packet overhead of the upstream link is large, e.g., packet radio + subnetworks, are an exception, section 3.2.) This scheduling scheme + does not require the upstream bottleneck router/host to explicitly + identify or maintain state for individual TCP connections. + + ACKs-first scheduling does not help avoid a delay due to a data + packet in transmission. Link fragmentation or suspend/resume may be + beneficial in this case. + + RECOMMENDATION: ACKs-first scheduling is an experimental transparent + modification performed at the upstream bottleneck link. If it is + used without a mechanism (such as ACK Congestion Control (ACC), + section 4.3) to regulate the volume of ACKs, it could lead to + starvation of data packets. 
This is a performance penalty + experienced by end hosts using the link and does not modify Internet + congestion behavior. Experiments indicate that ACKs-first scheduling + in combination with ACC is promising. However, there is little + experience of using the technique in the wider Internet. Further + development of the technique remains an open research issue, and + therefore the scheme is not currently recommended for use within the + Internet. + +6. Security Considerations + + The recommendations contained in this document do not impact the + integrity of TCP, introduce new security implications to the TCP + protocol, or applications using TCP. + + Some security considerations in the context of this document arise + from the implications of using IPSec by the end hosts or routers + operating along the return path. Use of IPSec prevents, or + complicates, some of the mitigations. For example: + + (i) When IPSec ESP [RFC2406] is used to encrypt the IP payload, the + TCP header can neither be read nor modified by intermediate + entities. This rules out header compression, ACK Filtering, ACK + Reconstruction, and the ACK Compaction. + + (ii) The TCP header information may be visible, when some forms of + network layer security are used. For example, using IPSec AH + [RFC2402], the TCP header may be read, but not modified, by + intermediaries. This may in future allow extensions to support + ACK Filtering, but rules out the generation of new + + + + +Balakrishnan et. al. Best Current Practice [Page 29] + +RFC 3449 PILC - Asymmetric Links December 2002 + + + packets by intermediaries (e.g., ACK Reconstruction). The + enhanced header compression scheme discussed in [RFC2507] would + also work with IPSec AH. + + There are potential Denial-of-Service (DoS) implications when using + Type 2 schemes. Unless additional security mechanisms are used, a + Reconstructor/expander could be exploited as a packet amplifier. 
A + third party may inject unauthorized Stretch ACKs into the reverse + path, triggering the generation of additional ACKs. These ACKs would + consume capacity on the return path and processing resources at the + systems along the path, including the destination host. This + provides a potential platform for a DoS attack. The usual + precautions must be taken to verify the correct tunnel end point, and + to ensure that applications cannot falsely inject packets that expand + to generate unwanted traffic. Imposing a rate limit and bound on the + delayed ACK factor(d) would also lessen the impact of any undetected + exploitation. + +7. Summary + + This document considers several TCP performance constraints that + arise from asymmetry in the properties of the forward and reverse + paths across an IP network. Such performance constraints arise, + e.g., as a result of both bandwidth (capacity) asymmetry, asymmetric + shared media in the reverse direction, and interactions with Media + Access Control (MAC) protocols. Asymmetric capacity may cause TCP + Acknowledgments (ACKs) to be lost or become inordinately delayed + (e.g., when a bottleneck link is shared between many flows, or when + there is bidirectional traffic). This effect may be exacerbated with + media-access delays (e.g., in certain multi-hop radio subnetworks, + satellite Bandwidth on Demand access). Asymmetry, and particularly + high asymmetry, raises a set of TCP performance issues. + + A set of techniques providing performance improvement is surveyed. + These include techniques to alleviate ACK Congestion and techniques + that enable a TCP sender to cope with infrequent ACKs without + destroying TCP self-clocking. These techniques include both end-to- + end, local link-layer, and subnetwork schemes. Many of these + techniques have been evaluated in detail via analysis, simulation, + and/or implementation on asymmetric subnetworks forming part of the + Internet.
There is however as yet insufficient operational + experience for some techniques, and these therefore currently remain + items of on-going research and experimentation. + + + + + + + + +Balakrishnan et. al. Best Current Practice [Page 30] + +RFC 3449 PILC - Asymmetric Links December 2002 + + + The following table summarizes the current recommendations. + Mechanisms are classified as recommended (REC), not recommended (NOT + REC) or experimental (EXP). Experimental techniques may not be well + specified. These techniques will require further operational + experience before they can be recommended for use in the public + Internet. + + The recommendations for end-to-end host modifications are summarized + in table 1. This lists each technique, the section in which each + technique is discussed, and where it is applied (S denotes the host + sending TCP data packets in the forward direction, R denotes the host + which receives these data packets). + + +------------------------+-------------+------------+--------+ + | Technique | Use | Section | Where | + +------------------------+-------------+------------+--------+ + | Modified Delayed ACKs | NOT REC | 4.1 | TCP R | + | Large MSS & NO FRAG | REC | 4.2 | TCP SR | + | Large MSS & IP FRAG | NOT REC | 4.2 | TCP SR | + | ACK Congestion Control | EXP | 4.3 | TCP SR | + | Window Pred. Mech (WPM)| NOT REC | 4.4 | TCP R | + | Window Cwnd. Est. (ACE)| NOT REC | 4.5 | TCP R | + | TCP Sender Pacing | EXP *1 | 4.6 | TCP S | + | Byte Counting | NOT REC *2 | 4.7 | TCP S | + | Backpressure | EXP *1 | 4.8 | TCP R | + +------------------------+-------------+------------+--------+ + + Table 1: Recommendations concerning host modifications. + + *1 Implementation of the technique may require changes to the + internal design of the protocol stack in end hosts. + *2 Dependent on a scheme for preventing excessive TCP transmission + burst. 
+ + The recommendations for techniques that do not require the TCP sender + and receiver to be aware of their existence (i.e., transparent + techniques) are summarized in table 2. Each technique is listed + along with the section in which each mechanism is discussed, and + where the technique is applied (S denotes the sending interface prior + to the upstream bottleneck link, R denotes receiving interface + following the upstream bottleneck link). + + + + + + + + + + +Balakrishnan et. al. Best Current Practice [Page 31] + +RFC 3449 PILC - Asymmetric Links December 2002 + + + +------------------------+-------------+------------+--------+ + | Mechanism | Use | Section | Type | + +------------------------+-------------+------------+--------+ + | Header Compr. (V-J) | REC *1 | 5.1.1 | 0 SR | + | Header Compr. (ROHC) | REC *1 *2 | 5.1.2 | 0 SR | + +------------------------+-------------+------------+--------+ + | ACK Filtering (AF) | EXP *3 | 5.2.1 | 1 S | + | ACK Decimation | EXP *3 | 5.2.2 | 1 S | + +------------------------+-------------+------------+--------+ + | ACK Reconstruction (AR)| NOT REC | 5.3.1 | 2 *4 | + | ACK Compaction/Compand.| EXP | 5.3.2 | 2 S *4 | + | Gen. Traff. Shap. (GTS)| REC | 5.3.3 | 2 *5 | + +------------------------+-------------+------------+--------+ + | Fair Queueing (FQ) | REC | 5.4.1 | 3 S | + | ACKs-First Scheduling | NOT REC | 5.4.2 | 3 S | + +------------------------+-------------+------------+--------+ + + Table 2: Recommendations concerning transparent modifications. + + *1 At high asymmetry these schemes may degrade TCP performance, but + are not considered harmful to the Internet. + *2 Standardisation of new TCP compression protocols is the subject of + ongoing work within the ROHC WG, refer to other IETF RFCs on the + use of these techniques. + *3 Use in the Internet is dependent on a scheme for preventing + excessive TCP transmission burst. + *4 Performed at a point along the reverse path after the upstream + bottleneck link. 
+ *5 Performed at a point along the forward path. + +8. Acknowledgments + + This document has benefited from comments from the members of the + Performance Implications of Links (PILC) Working Group. In + particular, the authors would like to thank John Border, Spencer + Dawkins, Aaron Falk, Dan Grossman, Randy Katz, Jeff Mandin, Rod + Ragland, Ramon Segura, Joe Touch, and Lloyd Wood for their useful + comments. They also acknowledge the data provided by Metricom Inc., + concerning operation of their packet data network. + +9. References + + References of the form RFCnnnn are Internet Request for Comments + (RFC) documents available online at http://www.rfc-editor.org/. + + + + + + + +Balakrishnan et. al. Best Current Practice [Page 32] + +RFC 3449 PILC - Asymmetric Links December 2002 + + +9.1 Normative References + + [RFC793] Postel, J., "Transmission Control Protocol", STD 7, RFC + 793, September 1981. + + [RFC1122] Braden, R., Ed., "Requirements for Internet Hosts - + Communication Layers", STD 3, RFC 1122, October 1989. + + [RFC1144] Jacobson, V., "Compressing TCP/IP Headers for Low-Speed + Serial Links", RFC 1144, February 1990. + + [RFC1191] Mogul, J. and S. Deering, "Path MTU Discovery", RFC 1191, + November 1990. + + [RFC2581] Allman, M., Paxson, V. and W. Stevens, "TCP Congestion + Control", RFC 2581, April 1999. + + [RFC2784] Farinacci, D., Li, T., Hanks, S., Meyer, D. and P. Traina, + "Generic Routing Encapsulation (GRE)", RFC 2784, March + 2000. + + [RFC3135] Border, J., Kojo, M., Griner, J., Montenegro, G. and Z. + Shelby, "Performance Enhancing Proxies Intended to Mitigate + Link-Related Degradations", RFC 3135, June 2001. + +9.2 Informative References + + [abc-ID] Allman, M., "TCP Congestion Control with Appropriate Byte + Counting", Work in Progress. + + [All97b] Allman, M., "Fixing Two BSD TCP Bugs", Technical Report + CR-204151, NASA Lewis Research Center, October 1997. 
+ + [ANS01] ANSI Standard T1.413, "Network to Customer Installation + Interfaces - Asymmetric Digital Subscriber Lines (ADSL) + Metallic Interface", November 1998. + + [ASB96] Arora, V., Suphasindhu, N., Baras, J.S. and D. Dillon, + "Asymmetric Internet Access over Satellite-Terrestrial + Networks", Proc. AIAA: 16th International Communications + Satellite Systems Conference and Exhibit, Part 1, + Washington, D.C., February 25-29, 1996, pp.476-482. + + [AST00] Aggarwal, A., Savage, S., and T. Anderson, "Understanding + the Performance of TCP Pacing", Proc. IEEE INFOCOM, Tel- + Aviv, Israel, V.3, March 2000, pp. 1157-1165. + + + + + +Balakrishnan et. al. Best Current Practice [Page 33] + +RFC 3449 PILC - Asymmetric Links December 2002 + + + [Bal98] Balakrishnan, H., "Challenges to Reliable Data Transport + over Heterogeneous Wireless Networks", Ph.D. Thesis, + University of California at Berkeley, USA, August 1998. + http://nms.lcs.mit.edu/papers/hari-phd/ + + [BPK99] Balakrishnan, H., Padmanabhan, V. N., and R. H. Katz, "The + Effects of Asymmetry on TCP Performance", ACM Mobile + Networks and Applications (MONET), Vol.4, No.3, 1999, pp. + 219-241. An expanded version of a paper published at Proc. + ACM/IEEE Mobile Communications Conference (MOBICOM), 1997. + + [BPS00] Bennett, J. C., Partridge, C., and N. Schectman, "Packet + Reordering is Not Pathological Network Behaviour", IEEE/ACM + Transactions on Networking, Vol. 7, Issue. 6, 2000, + pp.789-798. + + [Cla88] Clark, D.D, "The Design Philosophy of the DARPA Internet + Protocols", ACM Computer Communications Review (CCR), Vol. + 18, Issue 4, 1988, pp.106-114. + + [CLC99] Clausen, H., Linder, H., and B. Collini-Nocker, "Internet + over Broadcast Satellites", IEEE Communications Magazine, + Vol. 37, Issue. 6, 1999, pp.146-151. + + [CLP98] Calveras, A., Linares, J., and J. Paradells, "Window + Prediction Mechanism for Improving TCP in Wireless + Asymmetric Links". Proc. 
IEEE Global Communications + Conference (GLOBECOM), Sydney Australia, November 1998, + pp.533-538. + + [CR98] Cohen, R., and Ramanathan, S., "Tuning TCP for High + Performance in Hybrid Fiber Coaxial Broad-Band Access + Networks", IEEE/ACM Transactions on Networking, Vol.6, + No.1, 1998, pp.15-29. + + [DS00] Cable Television Laboratories, Inc., Data-Over-Cable + Service Interface Specifications---Radio Frequency + Interface Specification SP-RFIv1.1-I04-00407, 2000 + + [DS01] Data-Over-Cable Service Interface Specifications, Radio + Frequency Interface Specification 1.0, SP-RFI-I05-991105, + Cable Television Laboratories, Inc., November 1999. + + [DMT96] Durst, R., Miller, G., and E. Travis, "TCP Extensions for + Space Communications", ACM/IEEE Mobile Communications + Conference (MOBICOM), New York, USA, November 1996, pp.15- + 26. + + + + +Balakrishnan et. al. Best Current Practice [Page 34] + +RFC 3449 PILC - Asymmetric Links December 2002 + + + [EN97] "Digital Video Broadcasting (DVB); DVB Specification for + Data Broadcasting", European Standard (Telecommunications + series) EN 301 192, 1997. + + [EN00] "Digital Video Broadcasting (DVB); Interaction Channel for + Satellite Distribution Systems", Draft European Standard + (Telecommunications series) ETSI, Draft EN 301 790, v.1.2.1 + + [FJ93] Floyd, S., and V. Jacobson, "Random Early Detection + gateways for Congestion Avoidance", IEEE/ACM Transactions + on Networking, Vol.1, No.4, 1993, pp.397-413. + + [FSS01] Fairhurst, G., Samaraweera, N.K.G, Sooriyabandara, M., + Harun, H., Hodson, K., and R. Donardio, "Performance Issues + in Asymmetric Service Provision using Broadband Satellite", + IEE Proceedings on Communication, Vol.148, No.2, 2001, + pp.95-99. + + [ITU01] ITU-T Recommendation E.681, "Traffic Engineering Methods + For IP Access Networks Based on Hybrid Fiber/Coax System", + September 2001. + + [ITU02] ITU-T Recommendation G.992.1, "Asymmetrical Digital + Subscriber Line (ADSL) Transceivers", July 1999. 
+ + [Jac88] Jacobson, V., "Congestion Avoidance and Control", Proc. ACM + SIGCOMM, Stanford, CA, ACM Computer Communications Review + (CCR), Vol.18, No.4, 1988, pp.314-329. + + [Ken87] Kent C.A., and J. C. Mogul, "Fragmentation Considered + Harmful", Proc. ACM SIGCOMM, USA, ACM Computer + Communications Review (CCR), Vol.17, No.5, 1987, pp.390- + 401. + + [KSG98] Krout, T., Solsman, M., and J. Goldstein, "The Effects of + Asymmetric Satellite Networks on Protocols", Proc. IEEE + Military Communications Conference (MILCOM), Bradford, MA, + USA, Vol.3, 1998, pp.1072-1076. + + [KVR98] Kalampoukas, L., Varma, A., and Ramakrishnan, K.K., + "Improving TCP Throughput over Two-Way Asymmetric Links: + Analysis and Solutions", Proc. ACM SIGMETRICS, Madison, + USA, 1998, pp.78-89. + + [LM97] Lin, D., and R. Morris, "Dynamics of Random Early + Detection", Proc. ACM SIGCOMM, Cannes, France, ACM Computer + Communications Review (CCR), Vol.27, No.4, 1997, pp.78-89. + + + + +Balakrishnan et. al. Best Current Practice [Page 35] + +RFC 3449 PILC - Asymmetric Links December 2002 + + + [LMS97] Lakshman, T.V., Madhow, U., and B. Suter, "Window-based + Error Recovery and Flow Control with a Slow Acknowledgement + Channel: A Study of TCP/IP Performance", Proc. IEEE + INFOCOM, Vol.3, Kobe, Japan, 1997, pp.1199-1209. + + [MJW00] Ming-Chit, I.T., Jinsong, D., and W. Wang, "Improving TCP + Performance Over Asymmetric Networks", ACM SIGCOMM, ACM + Computer Communications Review (CCR), Vol.30, No.3, 2000. + + [Pad98] Padmanabhan, V.N., "Addressing the Challenges of Web Data + Transport", Ph.D. Thesis, University of California at + Berkeley, USA, September 1998 (also Tech Report UCB/CSD- + 98-1016). http://www.cs.berkeley.edu/~padmanab/phd- + thesis.html + + [RFC1323] Jacobson, V., Braden, R. and D. Borman, "TCP Extensions for + High Performance", RFC 1323, May 1992. + + [RFC2018] Mathis, B., Mahdavi, J., Floyd, S. and A. Romanow, "TCP + Selective Acknowledgment Options", RFC 2018, October 1996.
+ + [RFC2402] Kent, S. and R. Atkinson, "IP Authentication Header", RFC + 2402, November 1998. + + [RFC2406] Kent, S. and R. Atkinson, "IP Encapsulating Security + Payload (ESP)", RFC 2406, November 1998. + + [RFC2507] Degermark, M., Nordgren, B. and S. Pink, "IP Header + Compression", RFC 2507, February 1999. + + [RFC2525] Paxson, V., Allman, M., Dawson, S., Heavens, I. and B. + Volz, "Known TCP Implementation Problems", RFC 2525, March + 1999. + + [RFC2686] Bormann, C., "The Multi-Class Extension to Multi-Link PPP", + RFC 2686, September 1999. + + [RFC2760] Allman, M., Dawkins, S., Glover, D., Griner, J., Henderson, + T., Heidemann, J., Kruse, H., Ostermann, S., Scott, K., + Semke, J., Touch, J. and D. Tran, "Ongoing TCP Research + Related to Satellites", RFC 2760, February 2000. + + [RFC2988] Paxson, V. and M. Allman, "Computing TCP's Retransmission + Timer", RFC 2988, November 2000. + + [RFC3077] Duros, E., Dabbous, W., Izumiyama, H., Fujii, N. and Y. + Zhang, "A link Layer tunneling mechanism for unidirectional + links", RFC 3077, March 2001. + + + +Balakrishnan et. al. Best Current Practice [Page 36] + +RFC 3449 PILC - Asymmetric Links December 2002 + + + [RFC3095] Bormann, C., Burmeister, C., Degermark, M., Fukushima, H., + Hannu, H., Jonsson, E., Hakenberg, R., Koren, T., Le, K., + Liu, Z., Martensson, A., Miyazaki, A., Svanbro, K., Wiebke, + T., Yoshimura, T. and H. Zheng, "RObust Header Compression + (ROHC): Framework and four profiles: RTP, UDP ESP and + uncompressed", RFC 3095, July 2001. + + [RFC3150] Dawkins, S., Montenegro, G., Kojo, M. and V. Magret, "End- + to-end Performance Implications of Slow Links", BCP 48, RFC + 3150, July 2001. + + [RFC3168] Ramakrishnan K., Floyd, S. and D. Black, "A Proposal to add + Explicit Congestion Notification (ECN) to IP", RFC 3168, + September 2001. 
+ + [Sam99] Samaraweera, N.K.G, "Return Link Optimization for Internet + Service Provision Using DVB-S Networks", ACM Computer + Communications Review (CCR), Vol.29, No.3, 1999, pp.4-19. + + [Seg00] Segura R., "Asymmetric Networking Techniques For Hybrid + Satellite Communications", NC3A, The Hague, Netherlands, + NATO Technical Note 810, August 2000, pp.32-37. + + [SF98] Samaraweera, N.K.G., and G. Fairhurst. "High Speed Internet + Access using Satellite-based DVB Networks", Proc. IEEE + International Networks Conference (INC98), Plymouth, UK, + 1998, pp.23-28. + + [ZSC91] Zhang, L., Shenker, S., and D. D. Clark, "Observations and + Dynamics of a Congestion Control Algorithm: The Effects of + Two-Way Traffic", Proc. ACM SIGCOMM, ACM Computer + Communications Review (CCR), Vol 21, No 4, 1991, pp.133- + 147. + +10. IANA Considerations + + There are no IANA considerations associated with this document. + + + + + + + + + + + + + + +Balakrishnan et. al. Best Current Practice [Page 37] + +RFC 3449 PILC - Asymmetric Links December 2002 + + +Appendix - Examples of Subnetworks Exhibiting Network Path Asymmetry + + This appendix provides a list of some subnetworks which are known to + experience network path asymmetry. The asymmetry in capacity of + these network paths can require mitigations to provide acceptable + overall performance. Examples include the following: + + - IP service over some wide area and local area wireless networks. + In such networks, the predominant network path asymmetry arises + from the hub-and-spokes architecture of the network (e.g., a + single base station that communicates with multiple mobile + stations), this requires a Ready To Send / Clear To Send (RTS/CTS) + protocol and a Medium Access Control (MAC) protocol which needs to + accommodate the significant turn-around time for the radios. A + high per-packet transmission overhead may lead to significant + network path asymmetry. 
+ + - IP service over a forward satellite link utilizing Digital Video + Broadcast (DVB) transmission [EN97] (e.g., 38-45 Mbps), and a + slower upstream link using terrestrial network technology (e.g., + dial-up modem, line of sight microwave, cellular radio) [CLC99]. + Network path asymmetry arises from a difference in the upstream + and downstream link capacities. + + - Certain military networks [KSG98] providing Internet access to + in-transit or isolated hosts [Seg00] using a high capacity + downstream satellite link (e.g., 2-3 Mbps) with a narrowband + upstream link (e.g., 2.4-9.6 kbps) using either Demand Assigned + Multiple Access (DAMA) or fixed rate satellite links. The main + factor contributing to network path asymmetry is the difference in + the upstream and downstream link capacities. Some differences + between forward and reverse paths may arise from the way in which + upstream link capacity is allocated. + + - Most data over cable TV networks (e.g., DOCSIS [ITU01, DS00]), + where the analogue channels assigned for upstream communication + (i.e., in the reverse direction) are narrower and may be more + noisy than those assigned for the downstream link. As a + consequence, the upstream and downstream links differ in their + transmission rate. For example, in DOCSIS 1.0 [DS00], the + downstream transmission rate is either 27 or 52 Mbps. Upstream + transmission rates may be dynamically selected to be one of a + series of rates which range from 166 kbps to 9 Mbps. Operators + may assign multiple upstream channels per downstream channel. + Physical layer (PHY) overhead (which accompanies upstream + transmissions, but is not present in the downstream link) can also + increase the network path asymmetry. The Best Effort service, + which is typically used to carry TCP, uses a + + + +Balakrishnan et. al. Best Current Practice [Page 38] + +RFC 3449 PILC - Asymmetric Links December 2002 + + + contention/reservation MAC protocol.
A cable modem (CM) sending + an isolated packet (such as a TCP ACK) on the upstream link must + contend with other CMs to request capacity from the central cable + modem termination system (CMTS). The CMTS then grants timeslots + to a CM for the upstream transmission. The CM may "piggyback" + subsequent requests onto upstream packets, avoiding contention + cycles; as a result, spacing of TCP ACKs can be dramatically + altered due to minor variations in load of the cable data network + and inter-arrival times of TCP DATA packets. Numerous other + complexities may add to, or mitigate, the asymmetry in rate and + access latency experienced by packets sent on the upstream link + relative to downstream packets in DOCSIS. The asymmetry + experienced by end hosts may also change dynamically (e.g., with + network load), and when best effort services share capacity with + services that have symmetric reserved capacity (e.g., IP telephony + over the Unsolicited Grant service) [ITU01]. + + - Asymmetric Digital Subscriber Line (ADSL), by definition, offers a + downstream link transmission rate that is higher than that of the + upstream link. The available rates depend upon channel quality + and system configuration. For example, one widely deployed ADSL + technology [ITU02, ANS01] operates at rates that are multiples of + 32 kbps (up to 6.144 Mbps) in the downstream link, and up to 640 + kbps for the upstream link. The network path asymmetry + experienced by end hosts may be further increased when best effort + services, e.g., Internet access over ADSL, share the available + upstream capacity with reserved services (e.g., constant bit rate + voice telephony). + + + + + + + + + + + + + + + + + + + + + + + +Balakrishnan et. al. 
Best Current Practice [Page 39] + +RFC 3449 PILC - Asymmetric Links December 2002 + + +Authors' Addresses + + Hari Balakrishnan + Laboratory for Computer Science + 200 Technology Square + Massachusetts Institute of Technology + Cambridge, MA 02139 + USA + + Phone: +1-617-253-8713 + EMail: hari@lcs.mit.edu + Web: http://nms.lcs.mit.edu/~hari/ + + + Venkata N. Padmanabhan + Microsoft Research + One Microsoft Way + Redmond, WA 98052 + USA + + Phone: +1-425-705-2790 + EMail: padmanab@microsoft.com + Web: http://www.research.microsoft.com/~padmanab/ + + + Godred Fairhurst + Department of Engineering + Fraser Noble Building + University of Aberdeen + Aberdeen AB24 3UE + UK + + EMail: gorry@erg.abdn.ac.uk + Web: http://www.erg.abdn.ac.uk/users/gorry + + + Mahesh Sooriyabandara + Department of Engineering + Fraser Noble Building + University of Aberdeen + Aberdeen AB24 3UE + UK + + EMail: mahesh@erg.abdn.ac.uk + Web: http://www.erg.abdn.ac.uk/users/mahesh + + + + + + +Balakrishnan et. al. Best Current Practice [Page 40] + +RFC 3449 PILC - Asymmetric Links December 2002 + + +Full Copyright Statement + + Copyright (C) The Internet Society (2002). All Rights Reserved. + + This document and translations of it may be copied and furnished to + others, and derivative works that comment on or otherwise explain it + or assist in its implementation may be prepared, copied, published + and distributed, in whole or in part, without restriction of any + kind, provided that the above copyright notice and this paragraph are + included on all such copies and derivative works. 
However, this + document itself may not be modified in any way, such as by removing + the copyright notice or references to the Internet Society or other + Internet organizations, except as needed for the purpose of + developing Internet standards in which case the procedures for + copyrights defined in the Internet Standards process must be + followed, or as required to translate it into languages other than + English. + + The limited permissions granted above are perpetual and will not be + revoked by the Internet Society or its successors or assigns. + + This document and the information contained herein is provided on an + "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING + TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING + BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION + HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF + MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + +Acknowledgement + + Funding for the RFC Editor function is currently provided by the + Internet Society. + + + + + + + + + + + + + + + + + + + +Balakrishnan et. al. Best Current Practice [Page 41] + diff --git a/ext/picotcp/RFC/rfc3493.txt b/ext/picotcp/RFC/rfc3493.txt new file mode 100644 index 0000000..5fea6c1 --- /dev/null +++ b/ext/picotcp/RFC/rfc3493.txt @@ -0,0 +1,2187 @@ + + + + + + +Network Working Group R. Gilligan +Request for Comments: 3493 Intransa, Inc. +Obsoletes: 2553 S. Thomson +Category: Informational Cisco + J. Bound + J. McCann + Hewlett-Packard + W. Stevens + February 2003 + + + Basic Socket Interface Extensions for IPv6 + +Status of this Memo + + This memo provides information for the Internet community. It does + not specify an Internet standard of any kind. Distribution of this + memo is unlimited. + +Copyright Notice + + Copyright (C) The Internet Society (2003). All Rights Reserved. 
+ +Abstract + + The de facto standard Application Program Interface (API) for TCP/IP + applications is the "sockets" interface. Although this API was + developed for Unix in the early 1980s it has also been implemented on + a wide variety of non-Unix systems. TCP/IP applications written + using the sockets API have in the past enjoyed a high degree of + portability and we would like the same portability with IPv6 + applications. But changes are required to the sockets API to support + IPv6 and this memo describes these changes. These include a new + socket address structure to carry IPv6 addresses, new address + conversion functions, and some new socket options. These extensions + are designed to provide access to the basic IPv6 features required by + TCP and UDP applications, including multicasting, while introducing a + minimum of change into the system and providing complete + compatibility for existing IPv4 applications. Additional extensions + for advanced IPv6 features (raw sockets and access to the IPv6 + extension headers) are defined in another document. + + + + + + + + + + +Gilligan, et al. Informational [Page 1] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + +Table of Contents + + 1. Introduction................................................3 + 2. Design Considerations.......................................4 + 2.1 What Needs to be Changed...............................4 + 2.2 Data Types.............................................6 + 2.3 Headers................................................6 + 2.4 Structures.............................................6 + 3. 
Socket Interface............................................6 + 3.1 IPv6 Address Family and Protocol Family................6 + 3.2 IPv6 Address Structure.................................7 + 3.3 Socket Address Structure for 4.3BSD-Based Systems......7 + 3.4 Socket Address Structure for 4.4BSD-Based Systems......9 + 3.5 The Socket Functions...................................9 + 3.6 Compatibility with IPv4 Applications..................10 + 3.7 Compatibility with IPv4 Nodes.........................11 + 3.8 IPv6 Wildcard Address.................................11 + 3.9 IPv6 Loopback Address.................................13 + 3.10 Portability Additions.................................14 + 4. Interface Identification...................................16 + 4.1 Name-to-Index.........................................17 + 4.2 Index-to-Name.........................................17 + 4.3 Return All Interface Names and Indexes................18 + 4.4 Free Memory...........................................18 + 5. Socket Options.............................................18 + 5.1 Unicast Hop Limit.....................................19 + 5.2 Sending and Receiving Multicast Packets...............19 + 5.3 IPV6_V6ONLY option for AF_INET6 Sockets...............22 + 6. Library Functions..........................................22 + 6.1 Protocol-Independent Nodename and + Service Name Translation..............................23 + 6.2 Socket Address Structure to Node Name + and Service Name......................................28 + 6.3 Address Conversion Functions..........................31 + 6.4 Address Testing Macros................................33 + 7. Summary of New Definitions.................................33 + 8. Security Considerations....................................35 + 9. Changes from RFC 2553......................................35 + 10. Acknowledgments............................................36 + 11. 
References.................................................37 + 12. Authors' Addresses.........................................38 + 13. Full Copyright Statement...................................39 + + + + + + + + + +Gilligan, et al. Informational [Page 2] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + +1. Introduction + + While IPv4 addresses are 32 bits long, IPv6 addresses are 128 bits + long. The socket interface makes the size of an IP address quite + visible to an application; virtually all TCP/IP applications for + BSD-based systems have knowledge of the size of an IP address. Those + parts of the API that expose the addresses must be changed to + accommodate the larger IPv6 address size. IPv6 also introduces new + features, some of which must be made visible to applications via the + API. This memo defines a set of extensions to the socket interface + to support the larger address size and new features of IPv6. It + defines "basic" extensions that are of use to a broad range of + applications. A companion document, the "advanced" API [4], covers + extensions that are of use to more specialized applications, examples + of which include routing daemons, and the "ping" and "traceroute" + utilities. + + The development of this API was started in 1994 in the IETF IPng + working group. The API has evolved over the years, published first + in RFC 2133, then again in RFC 2553, and reaching its final form in + this document. + + As the API matured and stabilized, it was incorporated into the Open + Group's Networking Services (XNS) specification, issue 5.2, which was + subsequently incorporated into a joint Open Group/IEEE/ISO standard + [3]. + + Effort has been made to ensure that this document and [3] contain the + same information with regard to the API definitions. However, the + reader should note that this document is for informational purposes + only, and that the official standard specification of the sockets API + is [3]. 
+ + It is expected that any future standardization work on this API would + be done by the Open Group Base Working Group [6]. + + It should also be noted that this document describes only those + portions of the API needed for IPv4 and IPv6 communications. Other + potential uses of the API, for example the use of getaddrinfo() and + getnameinfo() with the AF_UNIX address family, are beyond the scope + of this document. + + + + + + + + + + +Gilligan, et al. Informational [Page 3] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + +2. Design Considerations + + There are a number of important considerations in designing changes + to this well-worn API: + + - The API changes should provide both source and binary + compatibility for programs written to the original API. That is, + existing program binaries should continue to operate when run on a + system supporting the new API. In addition, existing applications + that are re-compiled and run on a system supporting the new API + should continue to operate. Simply put, the API changes for IPv6 + should not break existing programs. An additional mechanism for + implementations to verify this is to verify the new symbols are + protected by Feature Test Macros as described in [3]. (Such + Feature Test Macros are not defined by this RFC.) + + - The changes to the API should be as small as possible in order to + simplify the task of converting existing IPv4 applications to + IPv6. + + - Where possible, applications should be able to use this API to + interoperate with both IPv6 and IPv4 hosts. Applications should + not need to know which type of host they are communicating with. + + - IPv6 addresses carried in data structures should be 64-bit + aligned. This is necessary in order to obtain optimum performance + on 64-bit machine architectures. 
+ + Because of the importance of providing IPv4 compatibility in the API, + these extensions are explicitly designed to operate on machines that + provide complete support for both IPv4 and IPv6. A subset of this + API could probably be designed for operation on systems that support + only IPv6. However, this is not addressed in this memo. + +2.1 What Needs to be Changed + + The socket interface API consists of a few distinct components: + + - Core socket functions. + + - Address data structures. + + - Name-to-address translation functions. + + - Address conversion functions. + + + + + + +Gilligan, et al. Informational [Page 4] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + + The core socket functions -- those functions that deal with such + things as setting up and tearing down TCP connections, and sending + and receiving UDP packets -- were designed to be transport + independent. Where protocol addresses are passed as function + arguments, they are carried via opaque pointers. A protocol-specific + address data structure is defined for each protocol that the socket + functions support. Applications must cast pointers to these + protocol-specific address structures into pointers to the generic + "sockaddr" address structure when using the socket functions. These + functions need not change for IPv6, but a new IPv6-specific address + data structure is needed. + + The "sockaddr_in" structure is the protocol-specific data structure + for IPv4. This data structure actually includes 8-octets of unused + space, and it is tempting to try to use this space to adapt the + sockaddr_in structure to IPv6. Unfortunately, the sockaddr_in + structure is not large enough to hold the 16-octet IPv6 address as + well as the other information (address family and port number) that + is needed. So a new address data structure must be defined for IPv6. 
+ + IPv6 addresses are scoped [2] so they could be link-local, site, + organization, global, or other scopes at this time undefined. To + support applications that want to be able to identify a set of + interfaces for a specific scope, the IPv6 sockaddr_in structure must + support a field that can be used by an implementation to identify a + set of interfaces identifying the scope for an IPv6 address. + + The IPv4 name-to-address translation functions in the socket + interface are gethostbyname() and gethostbyaddr(). These are left as + is, and new functions are defined which support both IPv4 and IPv6. + + The IPv4 address conversion functions -- inet_ntoa() and inet_addr() + -- convert IPv4 addresses between binary and printable form. These + functions are quite specific to 32-bit IPv4 addresses. We have + designed two analogous functions that convert both IPv4 and IPv6 + addresses, and carry an address type parameter so that they can be + extended to other protocol families as well. + + Finally, a few miscellaneous features are needed to support IPv6. A + new interface is needed to support the IPv6 hop limit header field. + New socket options are needed to control the sending and receiving of + IPv6 multicast packets. + + The socket interface will be enhanced in the future to provide access + to other IPv6 features. Some of these extensions are described in + [4]. + + + + + +Gilligan, et al. Informational [Page 5] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + +2.2 Data Types + + The data types of the structure elements given in this memo are + intended to track the relevant standards. uintN_t means an unsigned + integer of exactly N bits (e.g., uint16_t). The sa_family_t and + in_port_t types are defined in [3]. + +2.3 Headers + + When function prototypes and structures are shown we show the headers + that must be #included to cause that item to be defined. 
+ +2.4 Structures + + When structures are described the members shown are the ones that + must appear in an implementation. Additional, nonstandard members + may also be defined by an implementation. As an additional + precaution nonstandard members could be verified by Feature Test + Macros as described in [3]. (Such Feature Test Macros are not + defined by this RFC.) + + The ordering shown for the members of a structure is the recommended + ordering, given alignment considerations of multibyte members, but an + implementation may order the members differently. + +3. Socket Interface + + This section specifies the socket interface changes for IPv6. + +3.1 IPv6 Address Family and Protocol Family + + A new address family name, AF_INET6, is defined in <sys/socket.h>. + The AF_INET6 definition distinguishes between the original + sockaddr_in address data structure, and the new sockaddr_in6 data + structure. + + A new protocol family name, PF_INET6, is defined in <sys/socket.h>. + Like most of the other protocol family names, this will usually be + defined to have the same value as the corresponding address family + name: + + #define PF_INET6 AF_INET6 + + The AF_INET6 is used in the first argument to the socket() function + to indicate that an IPv6 socket is being created. + + + + + + +Gilligan, et al. Informational [Page 6] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + +3.2 IPv6 Address Structure + + A new in6_addr structure holds a single IPv6 address and is defined + as a result of including <netinet/in.h>: + + struct in6_addr { + uint8_t s6_addr[16]; /* IPv6 address */ + }; + + This data structure contains an array of sixteen 8-bit elements, + which make up one 128-bit IPv6 address. The IPv6 address is stored + in network byte order. + + The structure in6_addr above is usually implemented with an embedded + union with extra fields that force the desired alignment level in a + manner similar to BSD implementations of "struct in_addr".
Those + additional implementation details are omitted here for simplicity. + + An example is as follows: + + struct in6_addr { + union { + uint8_t _S6_u8[16]; + uint32_t _S6_u32[4]; + uint64_t _S6_u64[2]; + } _S6_un; + }; + #define s6_addr _S6_un._S6_u8 + +3.3 Socket Address Structure for 4.3BSD-Based Systems + + In the socket interface, a different protocol-specific data structure + is defined to carry the addresses for each protocol suite. Each + protocol-specific data structure is designed so it can be cast into a + protocol-independent data structure -- the "sockaddr" structure. + Each has a "family" field that overlays the "sa_family" of the + sockaddr data structure. This field identifies the type of the data + structure. + + The sockaddr_in structure is the protocol-specific address data + structure for IPv4. It is used to pass addresses between + applications and the system in the socket functions. The following + sockaddr_in6 structure holds IPv6 addresses and is defined as a + result of including the <netinet/in.h> header: + + + + + + + +Gilligan, et al. Informational [Page 7] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + +struct sockaddr_in6 { + sa_family_t sin6_family; /* AF_INET6 */ + in_port_t sin6_port; /* transport layer port # */ + uint32_t sin6_flowinfo; /* IPv6 flow information */ + struct in6_addr sin6_addr; /* IPv6 address */ + uint32_t sin6_scope_id; /* set of interfaces for a scope */ +}; + + This structure is designed to be compatible with the sockaddr data + structure used in the 4.3BSD release. + + The sin6_family field identifies this as a sockaddr_in6 structure. + This field overlays the sa_family field when the buffer is cast to a + sockaddr data structure. The value of this field must be AF_INET6. + + The sin6_port field contains the 16-bit UDP or TCP port number. This + field is used in the same way as the sin_port field of the + sockaddr_in structure. The port number is stored in network byte + order.
+ + The sin6_flowinfo field is a 32-bit field intended to contain flow- + related information. The exact way this field is mapped to or from a + packet is not currently specified. Until such time as its use is + specified, applications should set this field to zero when + constructing a sockaddr_in6, and ignore this field in a sockaddr_in6 + structure constructed by the system. + + The sin6_addr field is a single in6_addr structure (defined in the + previous section). This field holds one 128-bit IPv6 address. The + address is stored in network byte order. + + The ordering of elements in this structure is specifically designed + so that when sin6_addr field is aligned on a 64-bit boundary, the + start of the structure will also be aligned on a 64-bit boundary. + This is done for optimum performance on 64-bit architectures. + + The sin6_scope_id field is a 32-bit integer that identifies a set of + interfaces as appropriate for the scope [2] of the address carried in + the sin6_addr field. The mapping of sin6_scope_id to an interface or + set of interfaces is left to implementation and future specifications + on the subject of scoped addresses. + + Notice that the sockaddr_in6 structure will normally be larger than + the generic sockaddr structure. On many existing implementations the + sizeof(struct sockaddr_in) equals sizeof(struct sockaddr), with both + being 16 bytes. Any existing code that makes this assumption needs + to be examined carefully when converting to IPv6. + + + + +Gilligan, et al. Informational [Page 8] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + +3.4 Socket Address Structure for 4.4BSD-Based Systems + + The 4.4BSD release includes a small, but incompatible change to the + socket interface. The "sa_family" field of the sockaddr data + structure was changed from a 16-bit value to an 8-bit value, and the + space saved used to hold a length field, named "sa_len". 
The + sockaddr_in6 data structure given in the previous section cannot be + correctly cast into the newer sockaddr data structure. For this + reason, the following alternative IPv6 address data structure is + provided to be used on systems based on 4.4BSD. It is defined as a + result of including the <netinet/in.h> header. + +struct sockaddr_in6 { + uint8_t sin6_len; /* length of this struct */ + sa_family_t sin6_family; /* AF_INET6 */ + in_port_t sin6_port; /* transport layer port # */ + uint32_t sin6_flowinfo; /* IPv6 flow information */ + struct in6_addr sin6_addr; /* IPv6 address */ + uint32_t sin6_scope_id; /* set of interfaces for a scope */ +}; + + The only differences between this data structure and the 4.3BSD + variant are the inclusion of the length field, and the change of the + family field to a 8-bit data type. The definitions of all the other + fields are identical to the structure defined in the previous + section. + + Systems that provide this version of the sockaddr_in6 data structure + must also declare SIN6_LEN as a result of including the + <netinet/in.h> header. This macro allows applications to determine + whether they are being built on a system that supports the 4.3BSD or + 4.4BSD variants of the data structure. + +3.5 The Socket Functions + + Applications call the socket() function to create a socket descriptor + that represents a communication endpoint. The arguments to the + socket() function tell the system which protocol to use, and what + format address structure will be used in subsequent functions. For + example, to create an IPv4/TCP socket, applications make the call: + + s = socket(AF_INET, SOCK_STREAM, 0); + + To create an IPv4/UDP socket, applications make the call: + + s = socket(AF_INET, SOCK_DGRAM, 0); + + + + + +Gilligan, et al.
Informational [Page 9] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + + Applications may create IPv6/TCP and IPv6/UDP sockets (which may also + handle IPv4 communication as described in section 3.7) by simply + using the constant AF_INET6 instead of AF_INET in the first argument. + For example, to create an IPv6/TCP socket, applications make the + call: + + s = socket(AF_INET6, SOCK_STREAM, 0); + + To create an IPv6/UDP socket, applications make the call: + + s = socket(AF_INET6, SOCK_DGRAM, 0); + + Once the application has created a AF_INET6 socket, it must use the + sockaddr_in6 address structure when passing addresses in to the + system. The functions that the application uses to pass addresses + into the system are: + + bind() + connect() + sendmsg() + sendto() + + The system will use the sockaddr_in6 address structure to return + addresses to applications that are using AF_INET6 sockets. The + functions that return an address from the system to an application + are: + + accept() + recvfrom() + recvmsg() + getpeername() + getsockname() + + No changes to the syntax of the socket functions are needed to + support IPv6, since all of the "address carrying" functions use an + opaque address pointer, and carry an address length as a function + argument. + +3.6 Compatibility with IPv4 Applications + + In order to support the large base of applications using the original + API, system implementations must provide complete source and binary + compatibility with the original API. This means that systems must + continue to support AF_INET sockets and the sockaddr_in address + structure. Applications must be able to create IPv4/TCP and IPv4/UDP + sockets using the AF_INET constant in the socket() function, as + + + + + +Gilligan, et al. Informational [Page 10] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + + described in the previous section. 
Applications should be able to + hold a combination of IPv4/TCP, IPv4/UDP, IPv6/TCP and IPv6/UDP + sockets simultaneously within the same process. + + Applications using the original API should continue to operate as + they did on systems supporting only IPv4. That is, they should + continue to interoperate with IPv4 nodes. + +3.7 Compatibility with IPv4 Nodes + + The API also provides a different type of compatibility: the ability + for IPv6 applications to interoperate with IPv4 applications. This + feature uses the IPv4-mapped IPv6 address format defined in the IPv6 + addressing architecture specification [2]. This address format + allows the IPv4 address of an IPv4 node to be represented as an IPv6 + address. The IPv4 address is encoded into the low-order 32 bits of + the IPv6 address, and the high-order 96 bits hold the fixed prefix + 0:0:0:0:0:FFFF. IPv4-mapped addresses are written as follows: + + ::FFFF:<IPv4-address> + + These addresses can be generated automatically by the getaddrinfo() + function, as described in Section 6.1. + + Applications may use AF_INET6 sockets to open TCP connections to IPv4 + nodes, or send UDP packets to IPv4 nodes, by simply encoding the + destination's IPv4 address as an IPv4-mapped IPv6 address, and + passing that address, within a sockaddr_in6 structure, in the + connect() or sendto() call. When applications use AF_INET6 sockets + to accept TCP connections from IPv4 nodes, or receive UDP packets + from IPv4 nodes, the system returns the peer's address to the + application in the accept(), recvfrom(), or getpeername() call using + a sockaddr_in6 structure encoded this way. + + Few applications will likely need to know which type of node they are + interoperating with. However, for those applications that do need to + know, the IN6_IS_ADDR_V4MAPPED() macro, defined in Section 6.4, is + provided.
+ +3.8 IPv6 Wildcard Address + + While the bind() function allows applications to select the source IP + address of UDP packets and TCP connections, applications often want + the system to select the source address for them. With IPv4, one + specifies the address as the symbolic constant INADDR_ANY (called the + "wildcard" address) in the bind() call, or simply omits the bind() + entirely. + + + + +Gilligan, et al. Informational [Page 11] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + + Since the IPv6 address type is a structure (struct in6_addr), a + symbolic constant can be used to initialize an IPv6 address variable, + but cannot be used in an assignment. Therefore systems provide the + IPv6 wildcard address in two forms. + + The first version is a global variable named "in6addr_any" that is an + in6_addr structure. The extern declaration for this variable is + defined in <netinet/in.h>: + + extern const struct in6_addr in6addr_any; + + Applications use in6addr_any similarly to the way they use INADDR_ANY + in IPv4. For example, to bind a socket to port number 23, but let + the system select the source address, an application could use the + following code: + + struct sockaddr_in6 sin6; + . . . + sin6.sin6_family = AF_INET6; + sin6.sin6_flowinfo = 0; + sin6.sin6_port = htons(23); + sin6.sin6_addr = in6addr_any; /* structure assignment */ + . . . + if (bind(s, (struct sockaddr *) &sin6, sizeof(sin6)) == -1) + . . . + + The other version is a symbolic constant named IN6ADDR_ANY_INIT and + is defined in <netinet/in.h>. This constant can be used to + initialize an in6_addr structure: + + struct in6_addr anyaddr = IN6ADDR_ANY_INIT; + + Note that this constant can be used ONLY at declaration time. It can + not be used to assign a previously declared in6_addr structure. For + example, the following code will not work: + + /* This is the WRONG way to assign an unspecified address */ + struct sockaddr_in6 sin6; + . . .
+ sin6.sin6_addr = IN6ADDR_ANY_INIT; /* will NOT compile */ + + Be aware that the IPv4 INADDR_xxx constants are all defined in host + byte order but the IPv6 IN6ADDR_xxx constants and the IPv6 + in6addr_xxx externals are defined in network byte order. + + + + + + + +Gilligan, et al. Informational [Page 12] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + +3.9 IPv6 Loopback Address + + Applications may need to send UDP packets to, or originate TCP + connections to, services residing on the local node. In IPv4, they + can do this by using the constant IPv4 address INADDR_LOOPBACK in + their connect(), sendto(), or sendmsg() call. + + IPv6 also provides a loopback address to contact local TCP and UDP + services. Like the unspecified address, the IPv6 loopback address is + provided in two forms -- a global variable and a symbolic constant. + + The global variable is an in6_addr structure named + "in6addr_loopback." The extern declaration for this variable is + defined in <netinet/in.h>: + + extern const struct in6_addr in6addr_loopback; + + Applications use in6addr_loopback as they would use INADDR_LOOPBACK + in IPv4 applications (but beware of the byte ordering difference + mentioned at the end of the previous section). For example, to open + a TCP connection to the local telnet server, an application could use + the following code: + + struct sockaddr_in6 sin6; + . . . + sin6.sin6_family = AF_INET6; + sin6.sin6_flowinfo = 0; + sin6.sin6_port = htons(23); + sin6.sin6_addr = in6addr_loopback; /* structure assignment */ + . . . + if (connect(s, (struct sockaddr *) &sin6, sizeof(sin6)) == -1) + . . . + + The symbolic constant is named IN6ADDR_LOOPBACK_INIT and is defined + in <netinet/in.h>. It can be used at declaration time ONLY; for + example: + + struct in6_addr loopbackaddr = IN6ADDR_LOOPBACK_INIT; + + Like IN6ADDR_ANY_INIT, this constant cannot be used in an assignment + to a previously declared IPv6 address variable. + + + + + + + + + + +Gilligan, et al.
Informational [Page 13] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + +3.10 Portability Additions + + One simple addition to the sockets API that can help application + writers is the "struct sockaddr_storage". This data structure can + simplify writing code that is portable across multiple address + families and platforms. This data structure is designed with the + following goals. + + - Large enough to accommodate all supported protocol-specific address + structures. + + - Aligned at an appropriate boundary so that pointers to it can be + cast as pointers to protocol specific address structures and used + to access the fields of those structures without alignment + problems. + + The sockaddr_storage structure contains field ss_family which is of + type sa_family_t. When a sockaddr_storage structure is cast to a + sockaddr structure, the ss_family field of the sockaddr_storage + structure maps onto the sa_family field of the sockaddr structure. + When a sockaddr_storage structure is cast as a protocol specific + address structure, the ss_family field maps onto a field of that + structure that is of type sa_family_t and that identifies the + protocol's address family. + + + + + + + + + + + + + + + + + + + + + + + + + + + +Gilligan, et al. Informational [Page 14] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + + An example implementation design of such a data structure would be as + follows. + +/* + * Desired design of maximum size and alignment + */ +#define _SS_MAXSIZE 128 /* Implementation specific max size */ +#define _SS_ALIGNSIZE (sizeof (int64_t)) + /* Implementation specific desired alignment */ +/* + * Definitions used for sockaddr_storage structure paddings design. 
+ */ +#define _SS_PAD1SIZE (_SS_ALIGNSIZE - sizeof (sa_family_t)) +#define _SS_PAD2SIZE (_SS_MAXSIZE - (sizeof (sa_family_t) + + _SS_PAD1SIZE + _SS_ALIGNSIZE)) +struct sockaddr_storage { + sa_family_t ss_family; /* address family */ + /* Following fields are implementation specific */ + char __ss_pad1[_SS_PAD1SIZE]; + /* 6 byte pad, this is to make implementation + /* specific pad up to alignment field that */ + /* follows explicit in the data structure */ + int64_t __ss_align; /* field to force desired structure */ + /* storage alignment */ + char __ss_pad2[_SS_PAD2SIZE]; + /* 112 byte pad to achieve desired size, */ + /* _SS_MAXSIZE value minus size of ss_family */ + /* __ss_pad1, __ss_align fields is 112 */ +}; + + The above example implementation illustrates a data structure which + will align on a 64-bit boundary. An implementation-specific field + "__ss_align" along with "__ss_pad1" is used to force a 64-bit + alignment which covers proper alignment good enough for the needs of + sockaddr_in6 (IPv6), sockaddr_in (IPv4) address data structures. The + size of padding field __ss_pad1 depends on the chosen alignment + boundary. The size of padding field __ss_pad2 depends on the value + of overall size chosen for the total size of the structure. This + size and alignment are represented in the above example by + implementation specific (not required) constants _SS_MAXSIZE (chosen + value 128) and _SS_ALIGNSIZE (with chosen value 8). Constants + _SS_PAD1SIZE (derived value 6) and _SS_PAD2SIZE (derived value 112) + are also for illustration and not required. The derived values + assume sa_family_t is 2 bytes. The implementation specific + definitions and structure field names above start with an underscore + to denote implementation private namespace. Portable code is not + expected to access or reference those fields or constants. + + + + +Gilligan, et al. 
Informational [Page 15] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + + On implementations where the sockaddr data structure includes a + "sa_len" field this data structure would look like this: + +/* + * Definitions used for sockaddr_storage structure paddings design. + */ +#define _SS_PAD1SIZE (_SS_ALIGNSIZE - + (sizeof (uint8_t) + sizeof (sa_family_t)) +#define _SS_PAD2SIZE (_SS_MAXSIZE - + (sizeof (uint8_t) + sizeof (sa_family_t) + + _SS_PAD1SIZE + _SS_ALIGNSIZE)) +struct sockaddr_storage { + uint8_t ss_len; /* address length */ + sa_family_t ss_family; /* address family */ + /* Following fields are implementation specific */ + char __ss_pad1[_SS_PAD1SIZE]; + /* 6 byte pad, this is to make implementation + /* specific pad up to alignment field that */ + /* follows explicit in the data structure */ + int64_t __ss_align; /* field to force desired structure */ + /* storage alignment */ + char __ss_pad2[_SS_PAD2SIZE]; + /* 112 byte pad to achieve desired size, */ + /* _SS_MAXSIZE value minus size of ss_len, */ + /* __ss_family, __ss_pad1, __ss_align fields is 112 */ +}; + +4. Interface Identification + + This API uses an interface index (a small positive integer) to + identify the local interface on which a multicast group is joined + (Section 5.2). Additionally, the advanced API [4] uses these same + interface indexes to identify the interface on which a datagram is + received, or to specify the interface on which a datagram is to be + sent. + + Interfaces are normally known by names such as "le0", "sl1", "ppp2", + and the like. On Berkeley-derived implementations, when an interface + is made known to the system, the kernel assigns a unique positive + integer value (called the interface index) to that interface. These + are small positive integers that start at 1. (Note that 0 is never + used for an interface index.) There may be gaps so that there is no + current interface for a particular positive interface index. 
+ + This API defines two functions that map between an interface name and + index, a third function that returns all the interface names and + indexes, and a fourth function to return the dynamic memory allocated + by the previous function. How these functions are implemented is + + + +Gilligan, et al. Informational [Page 16] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + + left up to the implementation. 4.4BSD implementations can implement + these functions using the existing sysctl() function with the + NET_RT_IFLIST command. Other implementations may wish to use ioctl() + for this purpose. + +4.1 Name-to-Index + + The first function maps an interface name into its corresponding + index. + + #include <net/if.h> + + unsigned int if_nametoindex(const char *ifname); + + If ifname is the name of an interface, the if_nametoindex() function + shall return the interface index corresponding to name ifname; + otherwise, it shall return zero. No errors are defined. + +4.2 Index-to-Name + + The second function maps an interface index into its corresponding + name. + + #include <net/if.h> + + char *if_indextoname(unsigned int ifindex, char *ifname); + + When this function is called, the ifname argument shall point to a + buffer of at least IF_NAMESIZE bytes. The function shall place in + this buffer the name of the interface with index ifindex. + (IF_NAMESIZE is also defined in <net/if.h> and its value includes a + terminating null byte at the end of the interface name.) If ifindex + is an interface index, then the function shall return the value + supplied in ifname, which points to a buffer now containing the + interface name. Otherwise, the function shall return a NULL pointer + and set errno to indicate the error. If there is no interface + corresponding to the specified index, errno is set to ENXIO. If + there was a system error (such as running out of memory), errno would + be set to the proper value (e.g., ENOMEM). + + + + + + + + + + + + +Gilligan, et al.
Informational [Page 17] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + +4.3 Return All Interface Names and Indexes + + The if_nameindex structure holds the information about a single + interface and is defined as a result of including the <net/if.h> + header. + + struct if_nameindex { + unsigned int if_index; /* 1, 2, ... */ + char *if_name; /* null terminated name: "le0", ... */ + }; + + The final function returns an array of if_nameindex structures, one + structure per interface. + + #include <net/if.h> + + struct if_nameindex *if_nameindex(void); + + The end of the array of structures is indicated by a structure with + an if_index of 0 and an if_name of NULL. The function returns a NULL + pointer upon an error, and would set errno to the appropriate value. + + The memory used for this array of structures along with the interface + names pointed to by the if_name members is obtained dynamically. + This memory is freed by the next function. + +4.4 Free Memory + + The following function frees the dynamic memory that was allocated by + if_nameindex(). + + #include <net/if.h> + + void if_freenameindex(struct if_nameindex *ptr); + + The ptr argument shall be a pointer that was returned by + if_nameindex(). After if_freenameindex() has been called, the + application shall not use the array of which ptr is the address. + +5. Socket Options + + A number of new socket options are defined for IPv6. All of these + new options are at the IPPROTO_IPV6 level. That is, the "level" + parameter in the getsockopt() and setsockopt() calls is IPPROTO_IPV6 + when using these options. The constant name prefix IPV6_ is used in + all of the new socket options. This serves to clearly identify these + options as applying to IPv6. + + + + +Gilligan, et al.
Informational [Page 18] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + + The declaration for IPPROTO_IPV6, the new IPv6 socket options, and + related constants defined in this section are obtained by including + the header <netinet/in.h>. + +5.1 Unicast Hop Limit + + A new setsockopt() option controls the hop limit used in outgoing + unicast IPv6 packets. The name of this option is IPV6_UNICAST_HOPS, + and it is used at the IPPROTO_IPV6 layer. The following example + illustrates how it is used: + + int hoplimit = 10; + + if (setsockopt(s, IPPROTO_IPV6, IPV6_UNICAST_HOPS, + (char *) &hoplimit, sizeof(hoplimit)) == -1) + perror("setsockopt IPV6_UNICAST_HOPS"); + + When the IPV6_UNICAST_HOPS option is set with setsockopt(), the + option value given is used as the hop limit for all subsequent + unicast packets sent via that socket. If the option is not set, the + system selects a default value. The integer hop limit value (called + x) is interpreted as follows: + + x < -1: return an error of EINVAL + x == -1: use kernel default + 0 <= x <= 255: use x + x >= 256: return an error of EINVAL + + The IPV6_UNICAST_HOPS option may be used with getsockopt() to + determine the hop limit value that the system will use for subsequent + unicast packets sent via that socket. For example: + + int hoplimit; + socklen_t len = sizeof(hoplimit); + + if (getsockopt(s, IPPROTO_IPV6, IPV6_UNICAST_HOPS, + (char *) &hoplimit, &len) == -1) + perror("getsockopt IPV6_UNICAST_HOPS"); + else + printf("Using %d for hop limit.\n", hoplimit); + +5.2 Sending and Receiving Multicast Packets + + IPv6 applications may send multicast packets by simply specifying an + IPv6 multicast address as the destination address, for example in the + destination address argument of the sendto() function. + + + + + +Gilligan, et al.
Informational [Page 19] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + + Three socket options at the IPPROTO_IPV6 layer control some of the + parameters for sending multicast packets. Setting these options is + not required: applications may send multicast packets without using + these options. The setsockopt() options for controlling the sending + of multicast packets are summarized below. These three options can + also be used with getsockopt(). + + IPV6_MULTICAST_IF + + Set the interface to use for outgoing multicast packets. The + argument is the index of the interface to use. If the + interface index is specified as zero, the system selects the + interface (for example, by looking up the address in a routing + table and using the resulting interface). + + Argument type: unsigned int + + IPV6_MULTICAST_HOPS + + Set the hop limit to use for outgoing multicast packets. (Note + a separate option - IPV6_UNICAST_HOPS - is provided to set the + hop limit to use for outgoing unicast packets.) + + The interpretation of the argument is the same as for the + IPV6_UNICAST_HOPS option: + + x < -1: return an error of EINVAL + x == -1: use kernel default + 0 <= x <= 255: use x + x >= 256: return an error of EINVAL + + If IPV6_MULTICAST_HOPS is not set, the default is 1 + (same as IPv4 today) + + Argument type: int + + IPV6_MULTICAST_LOOP + + If a multicast datagram is sent to a group to which the sending + host itself belongs (on the outgoing interface), a copy of the + datagram is looped back by the IP layer for local delivery if + this option is set to 1. If this option is set to 0 a copy is + not looped back. Other option values return an error of + EINVAL. + + + + + + + +Gilligan, et al. Informational [Page 20] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + + If IPV6_MULTICAST_LOOP is not set, the default is 1 (loopback; + same as IPv4 today). 
+ + Argument type: unsigned int + + The reception of multicast packets is controlled by the two + setsockopt() options summarized below. An error of EOPNOTSUPP is + returned if these two options are used with getsockopt(). + + IPV6_JOIN_GROUP + + Join a multicast group on a specified local interface. + If the interface index is specified as 0, + the kernel chooses the local interface. + For example, some kernels look up the multicast group + in the normal IPv6 routing table and use the resulting + interface. + + Argument type: struct ipv6_mreq + + IPV6_LEAVE_GROUP + + Leave a multicast group on a specified interface. + If the interface index is specified as 0, the system + may choose a multicast group membership to drop by + matching the multicast address only. + + Argument type: struct ipv6_mreq + + The argument type of both of these options is the ipv6_mreq + structure, defined as a result of including the <netinet/in.h> + header; + + struct ipv6_mreq { + struct in6_addr ipv6mr_multiaddr; /* IPv6 multicast addr */ + unsigned int ipv6mr_interface; /* interface index */ + }; + + Note that to receive multicast datagrams a process must join the + multicast group to which datagrams will be sent. UDP applications + must also bind the UDP port to which datagrams will be sent. Some + processes also bind the multicast group address to the socket, in + addition to the port, to prevent other datagrams destined to that + same port from being delivered to the socket. + + + + + + + +Gilligan, et al. Informational [Page 21] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + +5.3 IPV6_V6ONLY option for AF_INET6 Sockets + + This socket option restricts AF_INET6 sockets to IPv6 communications + only. As stated in section <3.7 Compatibility with IPv4 Nodes>, + AF_INET6 sockets may be used for both IPv4 and IPv6 communications. + Some applications may want to restrict their use of an AF_INET6 + socket to IPv6 communications only.
For these applications the + IPV6_V6ONLY socket option is defined. When this option is turned on, + the socket can be used to send and receive IPv6 packets only. This + is an IPPROTO_IPV6 level option. This option takes an int value. + This is a boolean option. By default this option is turned off. + + Here is an example of setting this option: + + int on = 1; + + if (setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY, + (char *)&on, sizeof(on)) == -1) + perror("setsockopt IPV6_V6ONLY"); + else + printf("IPV6_V6ONLY set\n"); + + Note - This option has no effect on the use of IPv4 Mapped addresses + which enter a node as a valid IPv6 addresses for IPv6 communications + as defined by Stateless IP/ICMP Translation Algorithm (SIIT) [5]. + + An example use of this option is to allow two versions of the same + server process to run on the same port, one providing service over + IPv6, the other providing the same service over IPv4. + +6. Library Functions + + New library functions are needed to perform a variety of operations + with IPv6 addresses. Functions are needed to lookup IPv6 addresses + in the Domain Name System (DNS). Both forward lookup (nodename-to- + address translation) and reverse lookup (address-to-nodename + translation) need to be supported. Functions are also needed to + convert IPv6 addresses between their binary and textual form. + + We note that the two existing functions, gethostbyname() and + gethostbyaddr(), are left as-is. New functions are defined to handle + both IPv4 and IPv6 addresses. + + The commonly used function gethostbyname() is inadequate for many + applications, first because it provides no way for the caller to + specify anything about the types of addresses desired (IPv4 only, + IPv6 only, IPv4-mapped IPv6 are OK, etc.), and second because many + implementations of this function are not thread safe. RFC 2133 + + + +Gilligan, et al. 
Informational [Page 22] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + + defined a function named gethostbyname2() but this function was also + inadequate, first because its use required setting a global option + (RES_USE_INET6) when IPv6 addresses were required, and second because + a flag argument is needed to provide the caller with additional + control over the types of addresses required. The gethostbyname2() + function was deprecated in RFC 2553 and is no longer part of the + basic API. + +6.1 Protocol-Independent Nodename and Service Name Translation + + Nodename-to-address translation is done in a protocol-independent + fashion using the getaddrinfo() function. + +#include <sys/socket.h> +#include <netdb.h> + + +int getaddrinfo(const char *nodename, const char *servname, + const struct addrinfo *hints, struct addrinfo **res); + +void freeaddrinfo(struct addrinfo *ai); + +struct addrinfo { + int ai_flags; /* AI_PASSIVE, AI_CANONNAME, + AI_NUMERICHOST, .. */ + int ai_family; /* AF_xxx */ + int ai_socktype; /* SOCK_xxx */ + int ai_protocol; /* 0 or IPPROTO_xxx for IPv4 and IPv6 */ + socklen_t ai_addrlen; /* length of ai_addr */ + char *ai_canonname; /* canonical name for nodename */ + struct sockaddr *ai_addr; /* binary address */ + struct addrinfo *ai_next; /* next structure in linked list */ +}; + + The getaddrinfo() function translates the name of a service location + (for example, a host name) and/or a service name and returns a set of + socket addresses and associated information to be used in creating a + socket with which to address the specified service. + + The nodename and servname arguments are either null pointers or + pointers to null-terminated strings. One or both of these two + arguments must be a non-null pointer. + + The format of a valid name depends on the address family or families.
+ If a specific family is not given and the name could be interpreted + as valid within multiple supported families, the implementation will + attempt to resolve the name in all supported families and, in absence + of errors, one or more results shall be returned. + + + +Gilligan, et al. Informational [Page 23] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + + If the nodename argument is not null, it can be a descriptive name or + can be an address string. If the specified address family is + AF_INET, AF_INET6, or AF_UNSPEC, valid descriptive names include host + names. If the specified address family is AF_INET or AF_UNSPEC, + address strings using Internet standard dot notation as specified in + inet_addr() are valid. If the specified address family is AF_INET6 + or AF_UNSPEC, standard IPv6 text forms described in inet_pton() are + valid. + + If nodename is not null, the requested service location is named by + nodename; otherwise, the requested service location is local to the + caller. + + If servname is null, the call shall return network-level addresses + for the specified nodename. If servname is not null, it is a null- + terminated character string identifying the requested service. This + can be either a descriptive name or a numeric representation suitable + for use with the address family or families. If the specified + address family is AF_INET, AF_INET6 or AF_UNSPEC, the service can be + specified as a string specifying a decimal port number. + + If the argument hints is not null, it refers to a structure + containing input values that may direct the operation by providing + options and by limiting the returned information to a specific socket + type, address family and/or protocol. In this hints structure every + member other than ai_flags, ai_family, ai_socktype and ai_protocol + shall be set to zero or a null pointer. A value of AF_UNSPEC for + ai_family means that the caller shall accept any address family. 
A + value of zero for ai_socktype means that the caller shall accept any + socket type. A value of zero for ai_protocol means that the caller + shall accept any protocol. If hints is a null pointer, the behavior + shall be as if it referred to a structure containing the value zero + for the ai_flags, ai_socktype and ai_protocol fields, and AF_UNSPEC + for the ai_family field. + + Note: + + 1. If the caller handles only TCP and not UDP, for example, then the + ai_protocol member of the hints structure should be set to + IPPROTO_TCP when getaddrinfo() is called. + + 2. If the caller handles only IPv4 and not IPv6, then the ai_family + member of the hints structure should be set to AF_INET when + getaddrinfo() is called. + + + + + + + +Gilligan, et al. Informational [Page 24] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + + The ai_flags field to which hints parameter points shall be set to + zero or be the bitwise-inclusive OR of one or more of the values + AI_PASSIVE, AI_CANONNAME, AI_NUMERICHOST, AI_NUMERICSERV, + AI_V4MAPPED, AI_ALL, and AI_ADDRCONFIG. + + If the AI_PASSIVE flag is specified, the returned address information + shall be suitable for use in binding a socket for accepting incoming + connections for the specified service (i.e., a call to bind()). In + this case, if the nodename argument is null, then the IP address + portion of the socket address structure shall be set to INADDR_ANY + for an IPv4 address or IN6ADDR_ANY_INIT for an IPv6 address. If the + AI_PASSIVE flag is not specified, the returned address information + shall be suitable for a call to connect() (for a connection-mode + protocol) or for a call to connect(), sendto() or sendmsg() (for a + connectionless protocol). In this case, if the nodename argument is + null, then the IP address portion of the socket address structure + shall be set to the loopback address. This flag is ignored if the + nodename argument is not null. 
+ + If the AI_CANONNAME flag is specified and the nodename argument is + not null, the function shall attempt to determine the canonical name + corresponding to nodename (for example, if nodename is an alias or + shorthand notation for a complete name). + + If the AI_NUMERICHOST flag is specified, then a non-null nodename + string supplied shall be a numeric host address string. Otherwise, + an [EAI_NONAME] error is returned. This flag shall prevent any type + of name resolution service (for example, the DNS) from being invoked. + + If the AI_NUMERICSERV flag is specified, then a non-null servname + string supplied shall be a numeric port string. Otherwise, an + [EAI_NONAME] error shall be returned. This flag shall prevent any + type of name resolution service (for example, NIS+) from being + invoked. + + If the AI_V4MAPPED flag is specified along with an ai_family of + AF_INET6, then getaddrinfo() shall return IPv4-mapped IPv6 addresses + on finding no matching IPv6 addresses (ai_addrlen shall be 16). + + For example, when using the DNS, if no AAAA records are found then + a query is made for A records and any found are returned as IPv4- + mapped IPv6 addresses. + + The AI_V4MAPPED flag shall be ignored unless ai_family equals + AF_INET6. + + If the AI_ALL flag is used with the AI_V4MAPPED flag, then + getaddrinfo() shall return all matching IPv6 and IPv4 addresses. + + + +Gilligan, et al. Informational [Page 25] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + + For example, when using the DNS, queries are made for both AAAA + records and A records, and getaddrinfo() returns the combined + results of both queries. Any IPv4 addresses found are returned as + IPv4-mapped IPv6 addresses. + + The AI_ALL flag without the AI_V4MAPPED flag is ignored. + + Note: + + When ai_family is not specified (AF_UNSPEC), AI_V4MAPPED and + AI_ALL flags will only be used if AF_INET6 is supported. 
+ + If the AI_ADDRCONFIG flag is specified, IPv4 addresses shall be + returned only if an IPv4 address is configured on the local system, + and IPv6 addresses shall be returned only if an IPv6 address is + configured on the local system. The loopback address is not + considered for this case as valid as a configured address. + + For example, when using the DNS, a query for AAAA records should + occur only if the node has at least one IPv6 address configured + (other than IPv6 loopback) and a query for A records should occur + only if the node has at least one IPv4 address configured (other + than the IPv4 loopback). + + The ai_socktype field to which argument hints points specifies the + socket type for the service, as defined for socket(). If a specific + socket type is not given (for example, a value of zero) and the + service name could be interpreted as valid with multiple supported + socket types, the implementation shall attempt to resolve the service + name for all supported socket types and, in the absence of errors, + all possible results shall be returned. A non-zero socket type value + shall limit the returned information to values with the specified + socket type. + + If the ai_family field to which hints points has the value AF_UNSPEC, + addresses shall be returned for use with any address family that can + be used with the specified nodename and/or servname. Otherwise, + addresses shall be returned for use only with the specified address + family. If ai_family is not AF_UNSPEC and ai_protocol is not zero, + then addresses are returned for use only with the specified address + family and protocol; the value of ai_protocol shall be interpreted as + in a call to the socket() function with the corresponding values of + ai_family and ai_protocol. 
+ + The freeaddrinfo() function frees one or more addrinfo structures + returned by getaddrinfo(), along with any additional storage + associated with those structures (for example, storage pointed to by + the ai_canonname and ai_addr fields; an application must not + + + +Gilligan, et al. Informational [Page 26] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + + reference this storage after the associated addrinfo structure has + been freed). If the ai_next field of the structure is not null, the + entire list of structures is freed. The freeaddrinfo() function must + support the freeing of arbitrary sublists of an addrinfo list + originally returned by getaddrinfo(). + + Functions getaddrinfo() and freeaddrinfo() must be thread-safe. + + A zero return value for getaddrinfo() indicates successful + completion; a non-zero return value indicates failure. The possible + values for the failures are listed below under Error Return Values. + + Upon successful return of getaddrinfo(), the location to which res + points shall refer to a linked list of addrinfo structures, each of + which shall specify a socket address and information for use in + creating a socket with which to use that socket address. The list + shall include at least one addrinfo structure. The ai_next field of + each structure contains a pointer to the next structure on the list, + or a null pointer if it is the last structure on the list. Each + structure on the list shall include values for use with a call to the + socket() function, and a socket address for use with the connect() + function or, if the AI_PASSIVE flag was specified, for use with the + bind() function. The fields ai_family, ai_socktype, and ai_protocol + shall be usable as the arguments to the socket() function to create a + socket suitable for use with the returned address. 
The fields + ai_addr and ai_addrlen are usable as the arguments to the connect() + or bind() functions with such a socket, according to the AI_PASSIVE + flag. + + If nodename is not null, and if requested by the AI_CANONNAME flag, + the ai_canonname field of the first returned addrinfo structure shall + point to a null-terminated string containing the canonical name + corresponding to the input nodename; if the canonical name is not + available, then ai_canonname shall refer to the nodename argument or + a string with the same contents. The contents of the ai_flags field + of the returned structures are undefined. + + All fields in socket address structures returned by getaddrinfo() + that are not filled in through an explicit argument (for example, + sin6_flowinfo) shall be set to zero. + + Note: This makes it easier to compare socket address structures. + + + + + + + + + +Gilligan, et al. Informational [Page 27] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + + Error Return Values: + + The getaddrinfo() function shall fail and return the corresponding + value if: + + [EAI_AGAIN] The name could not be resolved at this time. Future + attempts may succeed. + + [EAI_BADFLAGS] The flags parameter had an invalid value. + + [EAI_FAIL] A non-recoverable error occurred when attempting to + resolve the name. + + [EAI_FAMILY] The address family was not recognized. + + [EAI_MEMORY] There was a memory allocation failure when trying to + allocate storage for the return value. + + [EAI_NONAME] The name does not resolve for the supplied + parameters. Neither nodename nor servname were + supplied. At least one of these must be supplied. + + [EAI_SERVICE] The service passed was not recognized for the + specified socket type. + + [EAI_SOCKTYPE] The intended socket type was not recognized. + + [EAI_SYSTEM] A system error occurred; the error code can be found + in errno. 
+ + The gai_strerror() function provides a descriptive text string + corresponding to an EAI_xxx error value. + + #include <netdb.h> + + const char *gai_strerror(int ecode); + + The argument is one of the EAI_xxx values defined for the + getaddrinfo() and getnameinfo() functions. The return value points + to a string describing the error. If the argument is not one of the + EAI_xxx values, the function still returns a pointer to a string + whose contents indicate an unknown error. + +6.2 Socket Address Structure to Node Name and Service Name + + The getnameinfo() function is used to translate the contents of a + socket address structure to a node name and/or service name. + + + + +Gilligan, et al. Informational [Page 28] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + + #include <sys/socket.h> + #include <netdb.h> + + int getnameinfo(const struct sockaddr *sa, socklen_t salen, + char *node, socklen_t nodelen, + char *service, socklen_t servicelen, + int flags); + + The getnameinfo() function shall translate a socket address to a node + name and service location, all of which are defined as in + getaddrinfo(). + + The sa argument points to a socket address structure to be + translated. + + The salen argument holds the size of the socket address structure + pointed to by sa. + + If the socket address structure contains an IPv4-mapped IPv6 address + or an IPv4-compatible IPv6 address, the implementation shall extract + the embedded IPv4 address and lookup the node name for that IPv4 + address. + + Note: The IPv6 unspecified address ("::") and the IPv6 loopback + address ("::1") are not IPv4-compatible addresses. If the address + is the IPv6 unspecified address ("::"), a lookup is not performed, + and the [EAI_NONAME] error is returned. + + If the node argument is non-NULL and the nodelen argument is nonzero, + then the node argument points to a buffer able to contain up to + nodelen characters that receives the node name as a null-terminated + string.
If the node argument is NULL or the nodelen argument is + zero, the node name shall not be returned. If the node's name cannot + be located, the numeric form of the node's address is returned + instead of its name. + + If the service argument is non-NULL and the servicelen argument is + non-zero, then the service argument points to a buffer able to + contain up to servicelen bytes that receives the service name as a + null-terminated string. If the service argument is NULL or the + servicelen argument is zero, the service name shall not be returned. + If the service's name cannot be located, the numeric form of the + service address (for example, its port number) shall be returned + instead of its name. + + The arguments node and service cannot both be NULL. + + + + + +Gilligan, et al. Informational [Page 29] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + + The flags argument is a flag that changes the default actions of the + function. By default the fully-qualified domain name (FQDN) for the + host shall be returned, but: + + - If the flag bit NI_NOFQDN is set, only the node name portion of + the FQDN shall be returned for local hosts. + + - If the flag bit NI_NUMERICHOST is set, the numeric form of the + host's address shall be returned instead of its name, under all + circumstances. + + - If the flag bit NI_NAMEREQD is set, an error shall be returned if + the host's name cannot be located. + + - If the flag bit NI_NUMERICSERV is set, the numeric form of the + service address shall be returned (for example, its port number) + instead of its name, under all circumstances. + + - If the flag bit NI_DGRAM is set, this indicates that the service + is a datagram service (SOCK_DGRAM). The default behavior shall + assume that the service is a stream service (SOCK_STREAM). + + Note: + + 1. The NI_NUMERICxxx flags are required to support the "-n" flags + that many commands provide. + + 2. 
The NI_DGRAM flag is required for the few AF_INET and AF_INET6 + port numbers (for example, [512,514]) that represent different + services for UDP and TCP. + + The getnameinfo() function shall be thread safe. + + A zero return value for getnameinfo() indicates successful + completion; a non-zero return value indicates failure. + + Upon successful completion, getnameinfo() shall return the node and + service names, if requested, in the buffers provided. The returned + names are always null-terminated strings. + + + + + + + + + + + + +Gilligan, et al. Informational [Page 30] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + + Error Return Values: + + The getnameinfo() function shall fail and return the corresponding + value if: + + [EAI_AGAIN] The name could not be resolved at this time. + Future attempts may succeed. + + [EAI_BADFLAGS] The flags had an invalid value. + + [EAI_FAIL] A non-recoverable error occurred. + + [EAI_FAMILY] The address family was not recognized or the address + length was invalid for the specified family. + + [EAI_MEMORY] There was a memory allocation failure. + + [EAI_NONAME] The name does not resolve for the supplied parameters. + NI_NAMEREQD is set and the host's name cannot be + located, or both nodename and servname were null. + + [EAI_OVERFLOW] An argument buffer overflowed. + + [EAI_SYSTEM] A system error occurred. The error code can be found + in errno. + +6.3 Address Conversion Functions + + The two IPv4 functions inet_addr() and inet_ntoa() convert an IPv4 + address between binary and text form. IPv6 applications need similar + functions. The following two functions convert both IPv6 and IPv4 + addresses: + + #include <arpa/inet.h> + + int inet_pton(int af, const char *src, void *dst); + + const char *inet_ntop(int af, const void *src, + char *dst, socklen_t size); + + The inet_pton() function shall convert an address in its standard + text presentation form into its numeric binary form.
The af argument + shall specify the family of the address. The AF_INET and AF_INET6 + address families shall be supported. The src argument points to the + string being passed in. The dst argument points to a buffer into + which the function stores the numeric address; this shall be large + enough to hold the numeric address (32 bits for AF_INET, 128 bits for + AF_INET6). The inet_pton() function shall return 1 if the conversion + + + +Gilligan, et al. Informational [Page 31] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + + succeeds, with the address pointed to by dst in network byte order. + It shall return 0 if the input is not a valid IPv4 dotted-decimal + string or a valid IPv6 address string, or -1 with errno set to + EAFNOSUPPORT if the af argument is unknown. + + If the af argument of inet_pton() is AF_INET, the src string shall be + in the standard IPv4 dotted-decimal form: + + ddd.ddd.ddd.ddd + + where "ddd" is a one to three digit decimal number between 0 and 255. + The inet_pton() function does not accept other formats (such as the + octal numbers, hexadecimal numbers, and fewer than four numbers that + inet_addr() accepts). + + If the af argument of inet_pton() is AF_INET6, the src string shall + be in one of the standard IPv6 text forms defined in Section 2.2 of + the addressing architecture specification [2]. + + The inet_ntop() function shall convert a numeric address into a text + string suitable for presentation. The af argument shall specify the + family of the address. This can be AF_INET or AF_INET6. The src + argument points to a buffer holding an IPv4 address if the af + argument is AF_INET, or an IPv6 address if the af argument is + AF_INET6; the address must be in network byte order. The dst + argument points to a buffer where the function stores the resulting + text string; it shall not be NULL. 
The size argument specifies the + size of this buffer, which shall be large enough to hold the text + string (INET_ADDRSTRLEN characters for IPv4, INET6_ADDRSTRLEN + characters for IPv6). + + In order to allow applications to easily declare buffers of the + proper size to store IPv4 and IPv6 addresses in string form, the + following two constants are defined in <netinet/in.h>: + + #define INET_ADDRSTRLEN 16 + #define INET6_ADDRSTRLEN 46 + + The inet_ntop() function shall return a pointer to the buffer + containing the text string if the conversion succeeds, and NULL + otherwise. Upon failure, errno is set to EAFNOSUPPORT if the af + argument is invalid or ENOSPC if the size of the result buffer is + inadequate. + + + + + + + + +Gilligan, et al. Informational [Page 32] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + +6.4 Address Testing Macros + + The following macros can be used to test for special IPv6 addresses. + + #include <netinet/in.h> + + int IN6_IS_ADDR_UNSPECIFIED (const struct in6_addr *); + int IN6_IS_ADDR_LOOPBACK (const struct in6_addr *); + int IN6_IS_ADDR_MULTICAST (const struct in6_addr *); + int IN6_IS_ADDR_LINKLOCAL (const struct in6_addr *); + int IN6_IS_ADDR_SITELOCAL (const struct in6_addr *); + int IN6_IS_ADDR_V4MAPPED (const struct in6_addr *); + int IN6_IS_ADDR_V4COMPAT (const struct in6_addr *); + + int IN6_IS_ADDR_MC_NODELOCAL(const struct in6_addr *); + int IN6_IS_ADDR_MC_LINKLOCAL(const struct in6_addr *); + int IN6_IS_ADDR_MC_SITELOCAL(const struct in6_addr *); + int IN6_IS_ADDR_MC_ORGLOCAL (const struct in6_addr *); + int IN6_IS_ADDR_MC_GLOBAL (const struct in6_addr *); + + The first seven macros return true if the address is of the specified + type, or false otherwise. The last five test the scope of a + multicast address and return true if the address is a multicast + address of the specified scope or false if the address is either not + a multicast address or not of the specified scope. 
+ + Note that IN6_IS_ADDR_LINKLOCAL and IN6_IS_ADDR_SITELOCAL return true + only for the two types of local-use IPv6 unicast addresses (Link- + Local and Site-Local) defined in [2], and that by this definition, + the IN6_IS_ADDR_LINKLOCAL macro returns false for the IPv6 loopback + address (::1). These two macros do not return true for IPv6 + multicast addresses of either link-local scope or site-local scope. + +7. Summary of New Definitions + + The following list summarizes the constants, structure, and extern + definitions discussed in this memo, sorted by header. + + IF_NAMESIZE + struct if_nameindex{}; + + AI_ADDRCONFIG + AI_ALL + AI_CANONNAME + AI_NUMERICHOST + AI_NUMERICSERV + AI_PASSIVE + AI_V4MAPPED + + + +Gilligan, et al. Informational [Page 33] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + + EAI_AGAIN + EAI_BADFLAGS + EAI_FAIL + EAI_FAMILY + EAI_MEMORY + EAI_NONAME + EAI_OVERFLOW + EAI_SERVICE + EAI_SOCKTYPE + EAI_SYSTEM + NI_DGRAM + NI_NAMEREQD + NI_NOFQDN + NI_NUMERICHOST + NI_NUMERICSERV + struct addrinfo{}; + + IN6ADDR_ANY_INIT + IN6ADDR_LOOPBACK_INIT + INET6_ADDRSTRLEN + INET_ADDRSTRLEN + IPPROTO_IPV6 + IPV6_JOIN_GROUP + IPV6_LEAVE_GROUP + IPV6_MULTICAST_HOPS + IPV6_MULTICAST_IF + IPV6_MULTICAST_LOOP + IPV6_UNICAST_HOPS + IPV6_V6ONLY + SIN6_LEN + extern const struct in6_addr in6addr_any; + extern const struct in6_addr in6addr_loopback; + struct in6_addr{}; + struct ipv6_mreq{}; + struct sockaddr_in6{}; + + AF_INET6 + PF_INET6 + struct sockaddr_storage; + + The following list summarizes the function and macro prototypes + discussed in this memo, sorted by header. + + int inet_pton(int, const char *, void *); + const char *inet_ntop(int, const void *, + char *, socklen_t); + + + + + +Gilligan, et al. 
Informational [Page 34] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + + char *if_indextoname(unsigned int, char *); + unsigned int if_nametoindex(const char *); + void if_freenameindex(struct if_nameindex *); + struct if_nameindex *if_nameindex(void); + + int getaddrinfo(const char *, const char *, + const struct addrinfo *, + struct addrinfo **); + int getnameinfo(const struct sockaddr *, socklen_t, + char *, socklen_t, char *, socklen_t, int); + void freeaddrinfo(struct addrinfo *); + const char *gai_strerror(int); + + int IN6_IS_ADDR_LINKLOCAL(const struct in6_addr *); + int IN6_IS_ADDR_LOOPBACK(const struct in6_addr *); + int IN6_IS_ADDR_MC_GLOBAL(const struct in6_addr *); + int IN6_IS_ADDR_MC_LINKLOCAL(const struct in6_addr *); + int IN6_IS_ADDR_MC_NODELOCAL(const struct in6_addr *); + int IN6_IS_ADDR_MC_ORGLOCAL(const struct in6_addr *); + int IN6_IS_ADDR_MC_SITELOCAL(const struct in6_addr *); + int IN6_IS_ADDR_MULTICAST(const struct in6_addr *); + int IN6_IS_ADDR_SITELOCAL(const struct in6_addr *); + int IN6_IS_ADDR_UNSPECIFIED(const struct in6_addr *); + int IN6_IS_ADDR_V4COMPAT(const struct in6_addr *); + int IN6_IS_ADDR_V4MAPPED(const struct in6_addr *); + +8. Security Considerations + + IPv6 provides a number of new security mechanisms, many of which need + to be accessible to applications. Companion memos detailing the + extensions to the socket interfaces to support IPv6 security are + being written. + +9. Changes from RFC 2553 + + 1. Add brief description of the history of this API and its relation + to the Open Group/IEEE/ISO standards. + + 2. Alignments with [3]. + + 3. Removed all references to getipnodebyname() and getipnodebyaddr(), + which are deprecated in favor of getaddrinfo() and getnameinfo(). + + 4. Added IPV6_V6ONLY IP level socket option to permit nodes to not + process IPv4 packets as IPv4 Mapped addresses in implementations. + + 5. Added SIIT to references and added new contributors. + + + + +Gilligan, et al. 
Informational [Page 35] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + + 6. In previous versions of this specification, the sin6_flowinfo + field was associated with the IPv6 traffic class and flow label, + but its usage was not completely specified. The complete + definition of the sin6_flowinfo field, including its association + with the traffic class or flow label, is now deferred to a future + specification. + +10. Acknowledgments + + This specification's evolution and completeness were significantly + influenced by the efforts of Richard Stevens, who has passed on. + Richard's wisdom and talent made the specification what it is today. + The co-authors will long think of Richard with great respect. + + Thanks to the many people who made suggestions and provided feedback + to this document, including: + + Werner Almesberger, Ran Atkinson, Fred Baker, Dave Borman, Andrew + Cherenson, Alex Conta, Alan Cox, Steve Deering, Richard Draves, + Francis Dupont, Robert Elz, Brian Haberman, Jun-ichiro itojun Hagino, + Marc Hasson, Tom Herbert, Bob Hinden, Wan-Yen Hsu, Christian Huitema, + Koji Imada, Markus Jork, Ron Lee, Alan Lloyd, Charles Lynn, Dan + McDonald, Dave Mitton, Finnbarr Murphy, Thomas Narten, Josh Osborne, + Craig Partridge, Jean-Luc Richier, Bill Sommerfield, Erik Scoredos, + Keith Sklower, JINMEI Tatuya, Dave Thaler, Matt Thomas, Harvey + Thompson, Dean D. Throop, Karen Tracey, Glenn Trewitt, Paul Vixie, + David Waitzman, Carl Williams, Kazu Yamamoto, Vlad Yasevich, Stig + Venaas, and Brian Zill. + + The getaddrinfo() and getnameinfo() functions are taken from an + earlier document by Keith Sklower. As noted in that document, + William Durst, Steven Wise, Michael Karels, and Eric Allman provided + many useful discussions on the subject of protocol-independent name- + to-address translation, and reviewed early versions of Keith + Sklower's original proposal. Eric Allman implemented the first + prototype of getaddrinfo(). 
The observation that specifying the pair + of name and service would suffice for connecting to a service + independent of protocol details was made by Marshall Rose in a + proposal to X/Open for a "Uniform Network Interface". + + Craig Metz, Jack McCann, Erik Nordmark, Tim Hartrick, and Mukesh + Kacker made many contributions to this document. Ramesh Govindan + made a number of contributions and co-authored an earlier version of + this memo. + + + + + + + +Gilligan, et al. Informational [Page 36] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + +11. References + + [1] Deering, S. and R. Hinden, "Internet Protocol, Version 6 (IPv6) + Specification", RFC 2460, December 1998. + + [2] Hinden, R. and S. Deering, "IP Version 6 Addressing + Architecture", RFC 2373, July 1998. + + [3] IEEE Std. 1003.1-2001 Standard for Information Technology -- + Portable Operating System Interface (POSIX). Open Group + Technical Standard: Base Specifications, Issue 6, December 2001. + ISO/IEC 9945:2002. http://www.opengroup.org/austin + + [4] Stevens, W. and M. Thomas, "Advanced Sockets API for IPv6", RFC + 2292, February 1998. + + [5] Nordmark, E., "Stateless IP/ICMP Translation Algorithm (SIIT)", + RFC 2765, February 2000. + + [6] The Open Group Base Working Group + http://www.opengroup.org/platform/base.html + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Gilligan, et al. Informational [Page 37] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + +12. Authors' Addresses + + Bob Gilligan + Intransa, Inc. + 2870 Zanker Rd. 
+ San Jose, CA 95134 + + Phone: 408-678-8647 + EMail: gilligan@intransa.com + + + Susan Thomson + Cisco Systems + 499 Thornall Street, 8th floor + Edison, NJ 08837 + + Phone: 732-635-3086 + EMail: sethomso@cisco.com + + + Jim Bound + Hewlett-Packard Company + 110 Spitbrook Road ZKO3-3/W20 + Nashua, NH 03062 + + Phone: 603-884-0062 + EMail: Jim.Bound@hp.com + + + Jack McCann + Hewlett-Packard Company + 110 Spitbrook Road ZKO3-3/W20 + Nashua, NH 03062 + + Phone: 603-884-2608 + EMail: Jack.McCann@hp.com + + + + + + + + + + + + + + + +Gilligan, et al. Informational [Page 38] + +RFC 3493 Basic Socket Interface Extensions for IPv6 February 2003 + + +13. Full Copyright Statement + + Copyright (C) The Internet Society (2003). All Rights Reserved. + + This document and translations of it may be copied and furnished to + others, and derivative works that comment on or otherwise explain it + or assist in its implementation may be prepared, copied, published + and distributed, in whole or in part, without restriction of any + kind, provided that the above copyright notice and this paragraph are + included on all such copies and derivative works. However, this + document itself may not be modified in any way, such as by removing + the copyright notice or references to the Internet Society or other + Internet organizations, except as needed for the purpose of + developing Internet standards in which case the procedures for + copyrights defined in the Internet Standards process must be + followed, or as required to translate it into languages other than + English. + + The limited permissions granted above are perpetual and will not be + revoked by the Internet Society or its successors or assigns. 
+ + This document and the information contained herein is provided on an + "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING + TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING + BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION + HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF + MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + +Acknowledgement + + Funding for the RFC Editor function is currently provided by the + Internet Society. + + + + + + + + + + + + + + + + + + + +Gilligan, et al. Informational [Page 39] + diff --git a/ext/picotcp/RFC/rfc3649.txt b/ext/picotcp/RFC/rfc3649.txt new file mode 100644 index 0000000..6a20e0d --- /dev/null +++ b/ext/picotcp/RFC/rfc3649.txt @@ -0,0 +1,1907 @@ + + + + + + +Network Working Group S. Floyd +Request for Comments: 3649 ICSI +Category: Experimental December 2003 + + + HighSpeed TCP for Large Congestion Windows + +Status of this Memo + + This memo defines an Experimental Protocol for the Internet + community. It does not specify an Internet standard of any kind. + Discussion and suggestions for improvement are requested. + Distribution of this memo is unlimited. + +Copyright Notice + + Copyright (C) The Internet Society (2003). All Rights Reserved. + +Abstract + + The proposals in this document are experimental. While they may be + deployed in the current Internet, they do not represent a consensus + that this is the best method for high-speed congestion control. In + particular, we note that alternative experimental proposals are + likely to be forthcoming, and it is not well understood how the + proposals in this document will interact with such alternative + proposals. + + This document proposes HighSpeed TCP, a modification to TCP's + congestion control mechanism for use with TCP connections with large + congestion windows. 
The congestion control mechanisms of the current + Standard TCP constrains the congestion windows that can be achieved + by TCP in realistic environments. For example, for a Standard TCP + connection with 1500-byte packets and a 100 ms round-trip time, + achieving a steady-state throughput of 10 Gbps would require an + average congestion window of 83,333 segments, and a packet drop rate + of at most one congestion event every 5,000,000,000 packets (or + equivalently, at most one congestion event every 1 2/3 hours). This + is widely acknowledged as an unrealistic constraint. To address this + limitation of TCP, this document proposes HighSpeed TCP, and solicits + experimentation and feedback from the wider community. + + + + + + + + + + +Floyd Experimental [Page 1] + +RFC 3649 HighSpeed TCP December 2003 + + +Table of Contents + + 1. Introduction. . . . . . . . . . . . . . . . . . . . . . . . . . 2 + 2. The Problem Description.. . . . . . . . . . . . . . . . . . . . 3 + 3. Design Guidelines.. . . . . . . . . . . . . . . . . . . . . . . 4 + 4. Non-Goals.. . . . . . . . . . . . . . . . . . . . . . . . . . . 5 + 5. Modifying the TCP Response Function.. . . . . . . . . . . . . . 6 + 6. Fairness Implications of the HighSpeed Response + Function. . . . . . . . . . . . . . . . . . . . . . . . . . . . 9 + 7. Translating the HighSpeed Response Function into + Congestion Control Parameters . . . . . . . . . . . . . . . . . 12 + 8. An alternate, linear response functions.. . . . . . . . . . . . 13 + 9. Tradeoffs for Choosing Congestion Control Parameters. . . . . . 16 + 9.1. The Number of Round-Trip Times between Loss Events . . . . 17 + 9.2. The Number of Packet Drops per Loss Event, with Drop-Tail. 17 + 10. Related Issues . . . . . . . . . . . . . . . . . . . . . . . . 18 + 10.1. Slow-Start. . . . . . . . . . . . . . . . . . . . . . . . 18 + 10.2. Limiting burstiness on short time scales. . . . . . . . . 19 + 10.3. Other limitations on window size. . . . . . . . . . . . . 
19 + 10.4. Implementation issues.. . . . . . . . . . . . . . . . . . 19 + 11. Deployment issues. . . . . . . . . . . . . . . . . . . . . . . 20 + 11.1. Deployment issues of HighSpeed TCP. . . . . . . . . . . . 20 + 11.2. Deployment issues of Scalable TCP . . . . . . . . . . . . 22 + 12. Related Work in HighSpeed TCP. . . . . . . . . . . . . . . . . 23 + 13. Relationship to other Work.. . . . . . . . . . . . . . . . . . 25 + 14. Conclusions. . . . . . . . . . . . . . . . . . . . . . . . . . 25 + 15. Acknowledgements . . . . . . . . . . . . . . . . . . . . . . . 25 + 16. Normative References . . . . . . . . . . . . . . . . . . . . . 26 + 17. Informative References . . . . . . . . . . . . . . . . . . . . 26 + 18. Security Considerations. . . . . . . . . . . . . . . . . . . . 28 + 19. IANA Considerations. . . . . . . . . . . . . . . . . . . . . . 28 + A. TCP's Loss Event Rate in Steady-State. . . . . . . . . . . . . 29 + B. A table for a(w) and b(w). . . . . . . . . . . . . . . . . . . 30 + C. Exploring the time to converge to fairness . . . . . . . . . . 32 + Author's Address . . . . . . . . . . . . . . . . . . . . . . . 33 + Full Copyright Statement . . . . . . . . . . . . . . . . . . . 34 + +1. Introduction + + This document proposes HighSpeed TCP, a modification to TCP's + congestion control mechanism for use with TCP connections with large + congestion windows. In a steady-state environment, with a packet + loss rate p, the current Standard TCP's average congestion window is + roughly 1.2/sqrt(p) segments. This places a serious constraint on + the congestion windows that can be achieved by TCP in realistic + environments. 
For example, for a Standard TCP connection with 1500- + byte packets and a 100 ms round-trip time, achieving a steady-state + throughput of 10 Gbps would require an average congestion window of + + + +Floyd Experimental [Page 2] + +RFC 3649 HighSpeed TCP December 2003 + + + 83,333 segments, and a packet drop rate of at most one congestion + event every 5,000,000,000 packets (or equivalently, at most one + congestion event every 1 2/3 hours). The average packet drop rate of + at most 2*10^(-10) needed for full link utilization in this + environment corresponds to a bit error rate of at most 2*10^(-14), + and this is an unrealistic requirement for current networks. + + To address this fundamental limitation of TCP and of the TCP response + function (the function mapping the steady-state packet drop rate to + TCP's average sending rate in packets per round-trip time), this + document describes a modified TCP response function for regimes with + higher congestion windows. This document also solicits + experimentation and feedback on HighSpeed TCP from the wider + community. + + Because HighSpeed TCP's modified response function would only take + effect with higher congestion windows, HighSpeed TCP does not modify + TCP behavior in environments with heavy congestion, and therefore + does not introduce any new dangers of congestion collapse. However, + if relative fairness between HighSpeed TCP connections is to be + preserved, then in our view any modification to the TCP response + function should be addressed in the IETF, rather than made as ad hoc + decisions by individual implementors or TCP senders. Modifications + to the TCP response function would also have implications for + transport protocols that use TFRC and other forms of equation-based + congestion control, as these congestion control mechanisms directly + use the TCP response function [RFC3448]. 
+ + This proposal for HighSpeed TCP focuses specifically on a proposed + change to the TCP response function, and its implications for TCP. + This document does not address what we view as a separate fundamental + issue, of the mechanisms required to enable best-effort connections + to *start* with large initial windows. In our view, while HighSpeed + TCP proposes a somewhat fundamental change to the TCP response + function, at the same time it is a relatively simple change to + implement in a single TCP sender, and presents no dangers in terms of + congestion collapse. In contrast, in our view, the problem of + enabling connections to *start* with large initial windows is + inherently more risky and structurally more difficult, requiring some + form of explicit feedback from all of the routers along the path. + This is another reason why we would propose addressing the problem of + starting with large initial windows separately, and on a separate + timetable, from the problem of modifying the TCP response function. + + + + + + + + +Floyd Experimental [Page 3] + +RFC 3649 HighSpeed TCP December 2003 + + +2. The Problem Description + + This section describes the number of round-trip times between + congestion events required for a Standard TCP flow to achieve an + average throughput of B bps, given packets of D bytes and a round- + trip time of R seconds. A congestion event refers to a window of + data with one or more dropped or ECN-marked packets (where ECN stands + for Explicit Congestion Notification). + + From Appendix A, achieving an average TCP throughput of B bps + requires a loss event at most every BR/(12D) round-trip times. This + is illustrated in Table 1, for R = 0.1 seconds and D = 1500 bytes. + The table also gives the average congestion window W of BR/(8D), and + the steady-state packet drop rate P of 1.5/W^2. 
+ + TCP Throughput (Mbps) RTTs Between Losses W P + --------------------- ------------------- ---- ----- + 1 5.5 8.3 0.02 + 10 55.5 83.3 0.0002 + 100 555.5 833.3 0.000002 + 1000 5555.5 8333.3 0.00000002 + 10000 55555.5 83333.3 0.0000000002 + + Table 1: RTTs Between Congestion Events for Standard TCP, for + 1500-Byte Packets and a Round-Trip Time of 0.1 Seconds. + + This document proposes HighSpeed TCP, a minimal modification to TCP's + increase and decrease parameters, for TCP connections with larger + congestion windows, to allow TCP to achieve high throughput with more + realistic requirements for the steady-state packet drop rate. + Equivalently, HighSpeed TCP has more realistic requirements for the + number of round-trip times between loss events. + +3. Design Guidelines + + Our proposal for HighSpeed TCP is motivated by the following + requirements: + + * Achieve high per-connection throughput without requiring + unrealistically low packet loss rates. + + * Reach high throughput reasonably quickly when in slow-start. + + * Reach high throughput without overly long delays when recovering + from multiple retransmit timeouts, or when ramping-up from a + period with small congestion windows. + + + + + +Floyd Experimental [Page 4] + +RFC 3649 HighSpeed TCP December 2003 + + + * No additional feedback or support required from routers: + + For example, the goal is for acceptable performance in both ECN- + capable and non-ECN-capable environments, and with Drop-Tail as well + as with Active Queue Management such as RED in the routers. + + * No additional feedback required from TCP receivers. + + * TCP-compatible performance in environments with moderate or high + congestion (e.g., packet drop rates of 1% or higher): + + Equivalently, the requirement is that there be no additional load on + the network (in terms of increased packet drop rates) in environments + with moderate or high congestion. 
+ + * Performance at least as good as Standard TCP in environments with + moderate or high congestion. + + * Acceptable transient performance, in terms of increases in the + congestion window in one round-trip time, responses to severe + congestion, and convergence times to fairness. + + Currently, users wishing to achieve throughputs of 1 Gbps or more + typically open up multiple TCP connections in parallel, or use MulTCP + [CO98,GRK99], which behaves roughly like the aggregate of N virtual + TCP connections. While this approach suffices for the occasional + user on well-provisioned links, it leaves the parameter N to be + determined by the user, and results in more aggressive performance + and higher steady-state packet drop rates if used in environments + with periods of moderate or high congestion. We believe that a new + approach is needed that offers more flexibility, more effectively + scales to a wide range of available bandwidths, and competes more + fairly with Standard TCP in congested environments. + +4. Non-Goals + + The following are explicitly *not* goals of our work: + + * Non-goal: TCP-compatible performance in environments with very low + packet drop rates. + + We note that our proposal does not require, or deliver, TCP- + compatible performance in environments with very low packet drop + rates, e.g., with packet loss rates of 10^-5 or 10^-6. As we discuss + later in this document, we assume that Standard TCP is unable to make + effective use of the available bandwidth in environments with loss + + + + + +Floyd Experimental [Page 5] + +RFC 3649 HighSpeed TCP December 2003 + + + rates of 10^-6 in any case, so that it is acceptable and appropriate + for HighSpeed TCP to perform more aggressively than Standard TCP in + such an environment. + + * Non-goal: Ramping-up more quickly than allowed by slow-start. 
+ + It is our belief that ramping-up more quickly than allowed by slow- + start would necessitate more explicit feedback from routers along the + path. The proposal for HighSpeed TCP is focused on changes to TCP + that could be effectively deployed in the current Internet + environment. + + * Non-goal: Avoiding oscillations in environments with only one-way, + long-lived flows all with the same round-trip times. + + While we agree that attention to oscillatory behavior is useful, + avoiding oscillations in aggregate throughput has not been our + primary consideration, particularly for simplified environments + limited to one-way, long-lived flows all with the same, large round- + trip times. Our assessment is that some oscillatory behavior in + these extreme environments is an acceptable price to pay for the + other benefits of HighSpeed TCP. + +5. Modifying the TCP Response Function + + The TCP response function, w = 1.2/sqrt(p), gives TCP's average + congestion window w in MSS-sized segments, as a function of the + steady-state packet drop rate p [FF98]. This TCP response function + is a direct consequence of TCP's Additive Increase Multiplicative + Decrease (AIMD) mechanisms of increasing the congestion window by + roughly one segment per round-trip time in the absence of congestion, + and halving the congestion window in response to a round-trip time + with a congestion event. This response function for Standard TCP is + reflected in the table below. In this proposal we restrict our + attention to TCP performance in environments with packet loss rates + of at most 10^-2, and so we can ignore the more complex response + functions that are required to model TCP performance in more + congested environments with retransmit timeouts. From Appendix A, an + average congestion window of W corresponds to an average of 2/3 W + round-trip times between loss events for Standard TCP (with the + congestion window varying from 2/3 W to 4/3 W). 
+ + + + + + + + + +Floyd Experimental [Page 6] + +RFC 3649 HighSpeed TCP December 2003 + + + Packet Drop Rate P Congestion Window W RTTs Between Losses + ------------------ ------------------- ------------------- + 10^-2 12 8 + 10^-3 38 25 + 10^-4 120 80 + 10^-5 379 252 + 10^-6 1200 800 + 10^-7 3795 2530 + 10^-8 12000 8000 + 10^-9 37948 25298 + 10^-10 120000 80000 + + Table 2: TCP Response Function for Standard TCP. The average + congestion window W in MSS-sized segments is given as a function of + the packet drop rate P. + + To specify a modified response function for HighSpeed TCP, we use + three parameters, Low_Window, High_Window, and High_P. To ensure TCP + compatibility, the HighSpeed response function uses the same response + function as Standard TCP when the current congestion window is at + most Low_Window, and uses the HighSpeed response function when the + current congestion window is greater than Low_Window. In this + document we set Low_Window to 38 MSS-sized segments, corresponding to + a packet drop rate of 10^-3 for TCP. + + To specify the upper end of the HighSpeed response function, we + specify the packet drop rate needed in the HighSpeed response + function to achieve an average congestion window of 83000 segments. + This is roughly the window needed to sustain 10 Gbps throughput, for + a TCP connection with the default packet size and round-trip time + used earlier in this document. For High_Window set to 83000, we + specify High_P of 10^-7; that is, with HighSpeed TCP a packet drop + rate of 10^-7 allows the HighSpeed TCP connection to achieve an + average congestion window of 83000 segments. We believe that this + loss rate sets an achievable target for high-speed environments, + while still allowing acceptable fairness for the HighSpeed response + function when competing with Standard TCP in environments with packet + drop rates of 10^-4 or 10^-5. 
+ + For simplicity, for the HighSpeed response function we maintain the + property that the response function gives a straight line on a log- + log scale (as does the response function for Standard TCP, for low to + moderate congestion). This results in the following response + function, for values of the average congestion window W greater than + Low_Window: + + W = (p/Low_P)^S Low_Window, + + + + +Floyd Experimental [Page 7] + +RFC 3649 HighSpeed TCP December 2003 + + + for Low_P the packet drop rate corresponding to Low_Window, and for S + as following constant [FRS02]: + + S = (log High_Window - log Low_Window)/(log High_P - log Low_P). + + (In this paper, "log x" refers to the log base 10.) For example, for + Low_Window set to 38, we have Low_P of 10^-3 (for compatibility with + Standard TCP). Thus, for High_Window set to 83000 and High_P set to + 10^-7, we get the following response function: + + W = 0.12/p^0.835. (1) + + This HighSpeed response function is illustrated in Table 3 below. + For HighSpeed TCP, the number of round-trip times between losses, + 1/(pW), equals 12.7 W^0.2, for W > 38 segments. + + Packet Drop Rate P Congestion Window W RTTs Between Losses + ------------------ ------------------- ------------------- + 10^-2 12 8 + 10^-3 38 25 + 10^-4 263 38 + 10^-5 1795 57 + 10^-6 12279 83 + 10^-7 83981 123 + 10^-8 574356 180 + 10^-9 3928088 264 + 10^-10 26864653 388 + + Table 3: TCP Response Function for HighSpeed TCP. The average + congestion window W in MSS-sized segments is given as a function of + the packet drop rate P. + + We believe that the problem of backward compatibility with Standard + TCP requires a response function that is quite close to that of + Standard TCP for loss rates of 10^-1, 10^-2, or 10^-3. We believe, + however, that such stringent TCP-compatibility is not required for + smaller loss rates, and that an appropriate response function is one + that gives a plausible packet drop rate for a connection throughput + of 10 Gbps. 
This also gives a slowly increasing number of round-trip + times between loss events as a function of a decreasing packet drop + rate. + + Another way to look at the HighSpeed response function is to consider + that HighSpeed TCP is roughly emulating the congestion control + response of N parallel TCP connections, where N is initially one, and + where N increases as a function of the HighSpeed TCP's congestion + window. Thus for the HighSpeed response function in Equation (1) + above, the response function can be viewed as equivalent to that of + + + +Floyd Experimental [Page 8] + +RFC 3649 HighSpeed TCP December 2003 + + + N(W) parallel TCP connections, where N(W) varies as a function of the + congestion window W. Recall that for a single standard TCP + connection, the average congestion window equals 1.2/sqrt(p). For N + parallel TCP connections, the aggregate congestion window for the N + connections equals N*1.2/sqrt(p). From the HighSpeed response + function in Equation (1) and the relationship above, we can derive + the following: + + N(W) = 0.23*W^(0.4) + + for N(W) the number of parallel TCP connections emulated by the + HighSpeed TCP response function, and for N(W) >= 1. This is shown in + Table 4 below. + + Congestion Window W Number N(W) of Parallel TCPs + ------------------- ------------------------- + 1 1 + 10 1 + 100 1.4 + 1,000 3.6 + 10,000 9.2 + 100,000 23.0 + + Table 4: Number N(W) of parallel TCP connections roughly emulated by + the HighSpeed TCP response function. + + In this document, we do not attempt to seriously evaluate the + HighSpeed response function for congestion windows greater than + 100,000 packets. We believe that we will learn more about the + requirements for sustaining the throughput of best-effort connections + in that range as we gain more experience with HighSpeed TCP with + congestion windows of thousands and tens of thousands of packets. 
+ There also might be limitations to the per-connection throughput that + can be realistically achieved for best-effort traffic, in terms of + congestion window of hundreds of thousands of packets or more, in the + absence of additional support or feedback from the routers along the + path. + +6. Fairness Implications of the HighSpeed Response Function + + The Standard and Highspeed Response Functions can be used directly to + infer the relative fairness between flows using the two response + functions. For example, given a packet drop rate P, assume that + Standard TCP has an average congestion window of W_Standard, and + HighSpeed TCP has a higher average congestion window of W_HighSpeed. + + + + + + +Floyd Experimental [Page 9] + +RFC 3649 HighSpeed TCP December 2003 + + + In this case, a single HighSpeed TCP connection is receiving + W_HighSpeed/W_Standard times the throughput of a single Standard TCP + connection competing in the same environment. + + This relative fairness is illustrated below in Table 5, for the + parameters used for the Highspeed response function in the section + above. The second column gives the relative fairness, for the + steady-state packet drop rate specified in the first column. To help + calibrate, the third column gives the aggregate average congestion + window for the two TCP connections, and the fourth column gives the + bandwidth that would be needed by the two connections to achieve that + aggregate window and packet drop rate, given 100 ms round-trip times + and 1500-byte packets. + + Packet Drop Rate P Fairness Aggregate Window Bandwidth + ------------------ -------- ---------------- --------- + 10^-2 1.0 24 2.8 Mbps + 10^-3 1.0 76 9.1 Mbps + 10^-4 2.2 383 45.9 Mbps + 10^-5 4.7 2174 260.8 Mbps + 10^-6 10.2 13479 1.6 Gbps + 10^-7 22.1 87776 10.5 Gbps + + Table 5: Relative Fairness between the HighSpeed and Standard + Response Functions. 
+ + Thus, for packet drop rates of 10^-4, a flow with the HighSpeed + response function can expect to receive 2.2 times the throughput of a + flow using the Standard response function, given the same round-trip + times and packet sizes. With packet drop rates of 10^-6 (or 10^-7), + the unfairness is more severe, and we have entered the regime where a + Standard TCP connection requires at most one congestion event every + 800 (or 2530) round-trip times in order to make use of the available + bandwidth. Our judgement would be that there are not a lot of TCP + connections effectively operating in this regime today, with + congestion windows of thousands of packets, and that therefore the + benefits of the HighSpeed response function would outweigh the + unfairness that would be experienced by Standard TCP in this regime. + However, one purpose of this document is to solicit feedback on this + issue. The parameter Low_Window determines directly the point of + divergence between the Standard and HighSpeed Response Functions. + + The third column of Table 5, the Aggregate Window, gives the + aggregate congestion window of the two competing TCP connections, + with HighSpeed and Standard TCP, given the packet drop rate specified + in the first column. From Table 5, a HighSpeed TCP connection would + receive ten times the bandwidth of a Standard TCP in an environment + with a packet drop rate of 10^-6. This would occur when the two + + + +Floyd Experimental [Page 10] + +RFC 3649 HighSpeed TCP December 2003 + + + flows sharing a single pipe achieved an aggregate window of 13479 + packets. Given a round-trip time of 100 ms and a packet size of 1500 + bytes, this would occur with an available bandwidth for the two + competing flows of 1.6 Gbps. + + Next we consider the time that it takes a standard or HighSpeed TCP + flow to converge to fairness against a pre-existing HighSpeed TCP + flow. 
The worst case for convergence to fairness occurs when a new + flow is starting up, competing against a high-bandwidth existing + flow, and the new flow suffers a packet drop and exits slow-start + while its window is still small. In the worst case, consider that + the new flow has entered the congestion avoidance phase while its + window is only one packet. A standard TCP flow in congestion + avoidance increases its window by at most one packet per round-trip + time, and after N round-trip times has only achieved a window of N + packets (when starting with a window of 1 in the first round-trip + time). In contrast, a HighSpeed TCP flow increases much faster than + a standard TCP flow while in the congestion avoidance phase, and we + can expect its convergence to fairness to be much better. This is + shown in Table 6 below. The script used to generate this table is + given in Appendix C. + + RTT HS_Window Standard_TCP_Window + --- --------- ------------------- + 100 131 100 + 200 475 200 + 300 1131 300 + 400 2160 400 + 500 3601 500 + 600 5477 600 + 700 7799 700 + 800 10567 800 + 900 13774 900 + 1000 17409 1000 + 1100 21455 1100 + 1200 25893 1200 + 1300 30701 1300 + 1400 35856 1400 + 1500 41336 1500 + 1600 47115 1600 + 1700 53170 1700 + 1800 59477 1800 + 1900 66013 1900 + 2000 72754 2000 + + Table 6: For a HighSpeed and a Standard TCP connection, the + congestion window during congestion avoidance phase (starting with a + congestion window of 1 packet during RTT 1). + + + +Floyd Experimental [Page 11] + +RFC 3649 HighSpeed TCP December 2003 + + + The classic paper on relative fairness is from Chiu and Jain [CJ89]. + This paper shows that AIMD (Additive Increase Multiplicative + Decrease) converges to fairness in an environment with synchronized + congestion events. From [CJ89], it is easy to see that MIMD and AIAD + do not converge to fairness in this environment.
However, the + results of [CJ89] do not apply to an asynchronous environment such as + that of the current Internet, where the frequency of congestion + feedback can be different for different flows. For example, it has + been shown that MIMD converges to fair states in a model with + proportional instead of synchronous feedback in terms of packet drops + [GV02]. Thus, we are not concerned about abandoning a strict model + of AIMD for HighSpeed TCP. However, we note that in an environment + with Drop-Tail queue management, there is likely to be some + synchronization of packet drops. In this environment, the model of + completely synchronous feedback does not hold, but the model of + completely asynchronous feedback is not accurate either. Fairness in + Drop-Tail environments is discussed in more detail in Sections 9 and + 12. + +7. Translating the HighSpeed Response Function into Congestion Control + Parameters + + For equation-based congestion control such as TFRC, the HighSpeed + Response Function above could be used directly by the TFRC congestion + control mechanism. However, for TCP the HighSpeed response function + has to be translated into additive increase and multiplicative + decrease parameters. The HighSpeed response function cannot be + achieved by TCP with an additive increase of one segment per round- + trip time and a multiplicative decrease of halving the current + congestion window; HighSpeed TCP will have to modify either the + increase or the decrease parameter, or both. We have concluded that + HighSpeed TCP is most likely to achieve an acceptable compromise + between moderate increases and timely decreases by modifying both the + increase and the decrease parameter. + + That is, for HighSpeed TCP let the congestion window increase by a(w) + segments per round-trip time in the absence of congestion, and let + the congestion window decrease to w(1-b(w)) segments in response to a + round-trip time with one or more loss events. 
Thus, in response to a + single acknowledgement HighSpeed TCP increases its congestion window + in segments as follows: + + w <- w + a(w)/w. + + In response to a congestion event, HighSpeed TCP decreases as + follows: + + w <- (1-b(w))w. + + + +Floyd Experimental [Page 12] + +RFC 3649 HighSpeed TCP December 2003 + + + For Standard TCP, a(w) = 1 and b(w) = 1/2, regardless of the value of + w. HighSpeed TCP uses the same values of a(w) and b(w) for w <= + Low_Window. This section specifies a(w) and b(w) for HighSpeed TCP + for larger values of w. + + For w = High_Window, we have specified a loss rate of High_P. From + [FRS02], or from elementary calculations, this requires the following + relationship between a(w) and b(w) for w = High_Window: + + a(w) = High_Window^2 * High_P * 2 * b(w)/(2-b(w)). (2) + + We use the parameter High_Decrease to specify the decrease parameter + b(w) for w = High_Window, and use Equation (2) to derive the increase + parameter a(w) for w = High_Window. Along with High_P = 10^-7 and + High_Window = 83000, for example, we specify High_Decrease = 0.1, + specifying that b(83000) = 0.1, giving a decrease of 10% after a + congestion event. Equation (2) then gives a(83000) = 72, for an + increase of 72 segments, or just under 0.1%, within a round-trip + time, for w = 83000. + + This moderate decrease strikes us as acceptable, particularly when + coupled with the role of TCP's ACK-clocking in limiting the sending + rate in response to more severe congestion [BBFS01]. A more severe + decrease would require a more aggressive increase in the congestion + window for a round-trip time without congestion. In particular, a + decrease factor High_Decrease of 0.5, as in Standard TCP, would + require an increase of 459 segments per round-trip time when w = + 83000. + + Given decrease parameters of b(w) = 1/2 for w = Low_Window, and b(w) + = High_Decrease for w = High_Window, we are left to specify the value + of b(w) for other values of w > Low_Window. 
From [FRS02], we let + b(w) vary linearly as the log of w, as follows: + + b(w) = (High_Decrease - 0.5) (log(w)-log(W)) / (log(W_1)-log(W)) + + 0.5, + + for W = Low_Window and W_1 = High_Window. The increase parameter + a(w) can then be computed as follows: + + a(w) = w^2 * p(w) * 2 * b(w)/(2-b(w)), + + for p(w) the packet drop rate for congestion window w. From + inverting Equation (1), we get p(w) as follows: + + p(w) = 0.078/w^1.2. + + + + + +Floyd Experimental [Page 13] + +RFC 3649 HighSpeed TCP December 2003 + + + We assume that experimental implementations of HighSpeed TCP for + further investigation will use a pre-computed look-up table for + finding a(w) and b(w). For example, the implementation from Tom + Dunigan adjusts the a(w) and b(w) parameters every 0.1 seconds. In + the appendix we give such a table for our default values of + Low_Window = 38, High_Window = 83,000, High_P = 10^-7, and + High_Decrease = 0.1. These are also the default values in the NS + simulator; example simulations in NS can be run with the command + "./test-all-tcpHighspeed" in the directory tcl/test. + +8. An alternate, linear response function + + In this section we explore an alternate, linear response function for + HighSpeed TCP that has been proposed by a number of other people, in + particular by Glenn Vinnicombe and Tom Kelly. Similarly, it has been + suggested by others that a less "ad-hoc" guideline for a response + function for HighSpeed TCP would be to specify a constant value for + the number of round-trip times between congestion events. + + Assume that we keep the value of Low_Window as 38 MSS-sized segments, + indicating when the HighSpeed response function diverges from the + current TCP response function, but that we modify the High_Window and + High_P parameters that specify the upper range of the HighSpeed + response function.
In particular, consider the response function + given by High_Window = 380,000 and High_P = 10^-7, with Low_Window = + 38 and Low_P = 10^-3 as before. + + Using the equations in Section 5, this would give the following + Linear response function, for w > Low_Window: + + W = 0.038/p. + + This Linear HighSpeed response function is illustrated in Table 7 + below. For HighSpeed TCP, the number of round-trip times between + losses, 1/(pW), equals 1/0.038, or approximately 26, for W > 38 + segments. + + + + + + + + + + + + + + + +Floyd Experimental [Page 14] + +RFC 3649 HighSpeed TCP December 2003 + + + Packet Drop Rate P Congestion Window W RTTs Between Losses + ------------------ ------------------- ------------------- + 10^-2 12 8 + 10^-3 38 26 + 10^-4 380 26 + 10^-5 3800 26 + 10^-6 38000 26 + 10^-7 380000 26 + 10^-8 3800000 26 + 10^-9 38000000 26 + 10^-10 380000000 26 + + Table 7: An Alternate, Linear TCP Response Function for HighSpeed + TCP. The average congestion window W in MSS-sized segments is given + as a function of the packet drop rate P. + + Given a constant decrease b(w) of 1/2, this would give an increase + a(w) of w/Low_Window, or equivalently, a constant increase of + 1/Low_Window packets per acknowledgement, for w > Low_Window. + Another possibility is Scalable TCP [K03], which uses a fixed + decrease b(w) of 1/8 and a fixed increase per acknowledgement of + 0.01. This gives an increase a(w) per window of 0.005 w, for a TCP + with delayed acknowledgements, for pure MIMD. + + The relative fairness between the alternate Linear response function + and the standard TCP response function is illustrated below in Table + 8.
+ + Packet Drop Rate P Fairness Aggregate Window Bandwidth + ------------------ -------- ---------------- --------- + 10^-2 1.0 24 2.8 Mbps + 10^-3 1.0 76 9.1 Mbps + 10^-4 3.2 500 60.0 Mbps + 10^-5 15.1 4179 501.4 Mbps + 10^-6 31.6 39200 4.7 Gbps + 10^-7 100.1 383795 46.0 Gbps + + Table 8: Relative Fairness between the Linear HighSpeed and Standard + Response Functions. + + One attraction of the linear response function is that it is scale- + invariant, with a fixed increase in the congestion window per + acknowledgement, and a fixed number of round-trip times between loss + events. My own assumption would be that having a fixed length for + the congestion epoch in round-trip times, regardless of the packet + drop rate, would be a poor fit for an imprecise and imperfect world + with routers with a range of queue management mechanisms, such as the + Drop-Tail queue management that is common today. For example, a + + + +Floyd Experimental [Page 15] + +RFC 3649 HighSpeed TCP December 2003 + + + response function with a fixed length for the congestion epoch in + round-trip times might give less clearly-differentiated feedback in + an environment with steady-state background losses at fixed intervals + for all flows (as might occur with a wireless link with occasional + short error bursts, giving losses for all flows every N seconds + regardless of their sending rate). + + While it is not a goal to have perfect fairness in an environment + with synchronized losses, it would be good to have moderately + acceptable performance in this regime. This goal might argue against + a response function with a constant number of round-trip times + between congestion events. However, this is a question that could + clearly use additional research and investigation. In addition, + flows with different round-trip times would have different time + durations for congestion epochs even in the model with a linear + response function. 
+ + The third column of Table 8, the Aggregate Window, gives the + aggregate congestion window of two competing TCP connections, one + with Linear HighSpeed TCP and one with Standard TCP, given the packet + drop rate specified in the first column. From Table 8, a Linear + HighSpeed TCP connection would receive fifteen times the bandwidth of + a Standard TCP in an environment with a packet drop rate of 10^-5. + This would occur when the two flows sharing a single pipe achieved an + aggregate window of 4179 packets. Given a round-trip time of 100 ms + and a packet size of 1500 bytes, this would occur with an available + bandwidth for the two competing flows of 501 Mbps. Thus, because the + Linear HighSpeed TCP is more aggressive than the HighSpeed TCP + proposed above, it also is less fair when competing with Standard TCP + in a high-bandwidth environment. + +9. Tradeoffs for Choosing Congestion Control Parameters + + A range of metrics can be used for evaluating choices for congestion + control parameters for HighSpeed TCP. My assumption in this section + is that for a response function of the form w = c/p^d, for constant c + and exponent d, the only response functions that would be considered + are response functions with 1/2 <= d <= 1. The two ends of this + spectrum are represented by current TCP, with d = 1/2, and by the + linear response function described in Section 8 above, with d = 1. + HighSpeed TCP lies somewhere in the middle of the spectrum, with d = + 0.835. + + Response functions with exponents less than 1/2 can be eliminated + from consideration because they would be even worse than standard TCP + in accommodating connections with high congestion windows. + + + + + +Floyd Experimental [Page 16] + +RFC 3649 HighSpeed TCP December 2003 + + +9.1. 
The Number of Round-Trip Times between Loss Events + + Response functions with exponents greater than 1 can be eliminated + from consideration because for these response functions, the number + of round-trip times between loss events decreases as congestion + decreases. For a response function of w = c/p^d, with one loss event + or congestion event every 1/p packets, the number of round-trip times + between loss events is w^((1/d)-1)/c^(1/d). Thus, for standard TCP + the number of round-trip times between loss events is linear in w. + In contrast, one attraction of the linear response function, as + described in Section 8 above, is that it is scale-invariant, in terms + of a fixed increase in the congestion window per acknowledgement, and + a fixed number of round-trip times between loss events. + + However, for a response function with d > 1, the number of round- + trip times between loss events would be proportional to w^((1/d)-1), + for a negative exponent ((1/d)-1), getting smaller as w increases. + This would seem undesirable. + +9.2. The Number of Packet Drops per Loss Event, with Drop-Tail + + A TCP connection increases its sending rate by a(w) packets per + round-trip time, and in a Drop-Tail environment, this is likely to + result in a(w) dropped packets during a single loss event. One + attraction of standard TCP is that it has a fixed increase per + round-trip time of one packet, minimizing the number of packets that + would be dropped in a Drop-Tail environment. For an environment with + some form of Active Queue Management, and in particular for an + environment that uses ECN, the number of packets dropped in a single + congestion event would not be a problem. However, even in these + environments, larger increases in the sending rate per round-trip + time result in larger stresses on the ability of the queues in the + router to absorb the fluctuations.
+ + HighSpeed TCP plays a middle ground between the metrics of a moderate + number of round-trip times between loss events, and a moderate + increase in the sending rate per round-trip time. As shown in + Appendix B, for a congestion window of 83,000 packets, HighSpeed TCP + increases its sending rate by 70 packets per round-trip time, + resulting in at most 70 packet drops when the buffer overflows in a + Drop-Tail environment. This increased aggressiveness is the price + paid by HighSpeed TCP for its increased scalability. A large number + of packets dropped per congestion event could result in synchronized + drops from multiple flows, with a possible loss of throughput as a + result. + + + + + + +Floyd Experimental [Page 17] + +RFC 3649 HighSpeed TCP December 2003 + + + Scalable TCP has an increase a(w) of 0.005 w packets per round-trip + time. For a congestion window of 83,000 packets, this gives an + increase of 415 packets per round-trip time, resulting in roughly 415 + packet drops per congestion event in a Drop-Tail environment. + + Thus, HighSpeed TCP and its variants place increased demands on queue + management in routers, relative to Standard TCP. (This is rather + similar to the increased demands on queue management that would + result from using N parallel TCP connections instead of a single + Standard TCP connection.) + +10. Related Issues + +10.1. Slow-Start + + A companion internet-draft on "Limited Slow-Start for TCP with Large + Congestion Windows" [F02b] proposes a modification to TCP's slow- + start procedure that can significantly improve the performance of TCP + connections slow-starting up to large congestion windows. For TCP + connections that are able to use congestion windows of thousands (or + tens of thousands) of MSS-sized segments (for MSS the sender's + MAXIMUM SEGMENT SIZE), the current slow-start procedure can result in + increasing the congestion window by thousands of segments in a single + round-trip time. 
Such an increase can easily result in thousands of + packets being dropped in one round-trip time. This is often + counter-productive for the TCP flow itself, and is also hard on the + rest of the traffic sharing the congested link. + + [F02b] proposes Limited Slow-Start, limiting the number of segments + by which the congestion window is increased for one window of data + during slow-start, in order to improve performance for TCP + connections with large congestion windows. We have separated out + Limited Slow-Start to a separate draft because it can be used both + with Standard or with HighSpeed TCP. + + Limited Slow-Start is illustrated in the NS simulator, for snapshots + after May 1, 2002, in the tests "./test-all-tcpHighspeed tcp1A" and + "./test-all-tcpHighspeed tcpHighspeed1" in the subdirectory + "tcl/lib". + + In order for best-effort flows to safely start-up faster than slow- + start, e.g., in future high-bandwidth networks, we believe that it + would be necessary for the flow to have explicit feedback from the + routers along the path. There are a number of proposals for this, + ranging from a minimal proposal for an IP option that allows TCP SYN + packets to collect information from routers along the path about the + allowed initial sending rate [J02], to proposals with more power that + require more fine-tuned and continuous feedback from routers. These + + + +Floyd Experimental [Page 18] + +RFC 3649 HighSpeed TCP December 2003 + + + proposals are all somewhat longer-term proposals than the HighSpeed + TCP proposal in this document, requiring longer lead times and more + coordination for deployment, and will be discussed in later + documents. + +10.2. Limiting burstiness on short time scales + + Because the congestion window achieved by a HighSpeed TCP connection + could be quite large, there is a possibility for the sender to send a + large burst of packets in response to a single acknowledgement. 
This + could happen, for example, when there is congestion or reordering on + the reverse path, and the sender receives an acknowledgement + acknowledging hundreds or thousands of new packets. Such a burst + would also result if the application was idle for a short period of + time less than a round-trip time, and then suddenly had lots of data + available to send. In this case, it would be useful for the + HighSpeed TCP connection to have some method for limiting bursts. + + In this document, we do not specify TCP mechanisms for reducing the + short-term burstiness. One possible mechanism is to use some form of + rate-based pacing, and another possibility is to use maxburst, which + limits the number of packets that are sent in response to a single + acknowledgement. We would caution, however, against a permanent + reduction in the congestion window as a mechanism for limiting + short-term bursts. Such a mechanism has been deployed in some TCP + stacks, and our view would be that using permanent reductions of the + congestion window to reduce transient bursts would be a bad idea + [Fl03]. + +10.3. Other limitations on window size + + The TCP header uses a 16-bit field to report the receive window size + to the sender. Unmodified, this allows a window size of at most + 2**16 = 65K bytes. With window scaling, the maximum window size is + 2**30 = 1073M bytes [RFC 1323]. Given 1500-byte packets, this allows + a window of up to 715,000 packets. + +10.4. Implementation issues + + One implementation issue that has been raised with HighSpeed TCP is + that with congestion windows of 4MB or more, the handling of + successive SACK packets after a packet is dropped becomes very time- + consuming at the TCP sender [S03]. Tom Kelly's Scalable TCP includes + a "SACK Fast Path" patch that addresses this problem. 
+ + The issues addressed in the Web100 project, the Net100 project, and + related projects about the tuning necessary to achieve high bandwidth + data rates with TCP apply to HighSpeed TCP as well [Net100, Web100]. + + + +Floyd Experimental [Page 19] + +RFC 3649 HighSpeed TCP December 2003 + + +11. Deployment issues + +11.1. Deployment issues of HighSpeed TCP + + We do not claim that the HighSpeed TCP modification to TCP described + in this paper is an optimal transport protocol for high-bandwidth + environments. Based on our experiences with HighSpeed TCP in the NS + simulator [NS], on simulation studies [SA03], and on experimental + reports [ABLLS03,D02,CC03,F03], we believe that HighSpeed TCP + improves the performance of TCP in high-bandwidth environments, and + we are documenting it for the benefit of the IETF community. We + encourage the use of HighSpeed TCP, and of its underlying response + function, and we further encourage feedback about operational + experiences with this or related modifications. + + We note that in environments typical of much of the current Internet, + HighSpeed TCP behaves exactly as does Standard TCP today. This is + the case any time the congestion window is less than 38 segments. + + Bandwidth Avg Cwnd w (pkts) Increase a(w) Decrease b(w) + --------- ----------------- ------------- ------------- + 1.5 Mbps 12.5 1 0.50 + 10 Mbps 83 1 0.50 + 100 Mbps 833 6 0.35 + 1 Gbps 8333 26 0.22 + 10 Gbps 83333 70 0.10 + + Table 9: Performance of a HighSpeed TCP connection + + To help calibrate, Table 9 considers a TCP connection with 1500-byte + packets, an RTT of 100 ms (including average queueing delay), and no + competing traffic, and shows the average congestion window if that + TCP connection had a pipe all to itself and fully used the link + bandwidth, for a range of bandwidths for the pipe. This assumes that + the TCP connection would use Table 12 in determining its increase and + decrease parameters. 
The first column of Table 9 gives the + bandwidth, and the second column gives the average congestion window + w needed to utilize that bandwidth. The third column shows the + increase a(w) in segments per RTT for window w. The fourth column + shows the decrease b(w) for that window w (where the TCP sender + decreases the congestion window from w to w(1-b(w)) segments after a + loss event). When a loss occurs we note that the actual congestion + window is likely to be greater than the average congestion window w + in column 2, so the decrease parameter used could be slightly smaller + than the one given in column 4 of Table 9. + + Table 9 shows that a HighSpeed TCP over a 10 Mbps link behaves + exactly the same as a Standard TCP connection, even in the absence of + + + +Floyd Experimental [Page 20] + +RFC 3649 HighSpeed TCP December 2003 + + + competing traffic. One can think of the congestion window staying + generally in the range of 55 to 110 segments, with the HighSpeed TCP + behavior being exactly the same as the behavior of Standard TCP. (If + the congestion window is ever 128 segments or more, then the + HighSpeed TCP increases by two segments per RTT instead of by one, + and uses a decrease parameter of 0.44 instead of 0.50.) + + Table 9 shows that for a HighSpeed TCP connection over a 100 Mbps + link, with no competing traffic, HighSpeed TCP behaves roughly as + aggressively as six parallel TCP connections, increasing its + congestion window by roughly six segments per round-trip time, and + with a decrease parameter of roughly 1/3 (corresponding to decreasing + down to 2/3-rds of its old congestion window, rather than to half, in + response to a loss event). 
+ + For a Standard TCP connection in this environment, the congestion + window could be thought of as generally varying in the range of 550 + to 1100 segments, with an average packet drop rate of 2.2 * 10^-6 + (corresponding to a bit error rate of 1.8 * 10^-10), or equivalently, + roughly 55 seconds between congestion events. While a Standard TCP + connection could sustain such a low packet drop rate in a carefully + controlled environment with minimal competing traffic, we would + contend that in an uncontrolled best-effort environment with even a + small amount of competing traffic, the occasional congestion events + from smaller competing flows could easily be sufficient to prevent a + Standard TCP flow with no lower-speed bottlenecks from fully + utilizing the available bandwidth of the underutilized 100 Mbps link. + + That is, we would contend that in the environment of 100 Mbps links + with a significant amount of available bandwidth, Standard TCP would + sometimes be unable to fully utilize the link bandwidth, and that + HighSpeed TCP would be an improvement in this regard. We would + further contend that in this environment, the behavior of HighSpeed + TCP is sufficiently close to that of Standard TCP that HighSpeed TCP + would be safe to deploy in the current Internet. We note that + HighSpeed TCP can only use high congestion windows if allowed by the + receiver's advertised window size. As a result, even if HighSpeed + TCP was ubiquitously deployed in the Internet, the impact would be + limited to those TCP connections with an advertised window from the + receiver of 118 MSS or larger. + + We do not believe that the deployment of HighSpeed TCP would serve as + a block to the possible deployment of alternate experimental + protocols for high-speed congestion control, such as Scalable TCP, + XCP [KHR02], or FAST TCP [JWL03]. 
In particular, we don't expect + HighSpeed TCP to interact any more poorly with alternative + experimental proposals than would the N parallel TCP connections + commonly used today in the absence of HighSpeed TCP. + + + +Floyd Experimental [Page 21] + +RFC 3649 HighSpeed TCP December 2003 + + +11.2. Deployment issues of Scalable TCP + + We believe that Scalable TCP and HighSpeed TCP have sufficiently + similar response functions that they could easily coexist in the + Internet. However, we have not investigated Scalable TCP + sufficiently to be able to claim, in this document, that Scalable TCP + is safe for a widespread deployment in the current Internet. + + Bandwidth Avg Cwnd w (pkts) Increase a(w) Decrease b(w) + --------- ----------------- ------------- ------------- + 1.5 Mbps 12.5 1 0.50 + 10 Mbps 83 0.4 0.125 + 100 Mbps 833 4.1 0.125 + 1 Gbps 8333 41.6 0.125 + 10 Gbps 83333 416.5 0.125 + + Table 10: Performance of a Scalable TCP connection. + + Table 10 shows the performance of a Scalable TCP connection with + 1500-byte packets, an RTT of 100 ms (including average queueing + delay), and no competing traffic. The TCP connection is assumed to + use delayed acknowledgements. The first column of Table 10 gives the + bandwidth, the second column gives the average congestion window + needed to utilize that bandwidth, and the third and fourth columns + give the increase and decrease parameters. + + Note that even in an environment with a 10 Mbps link, Scalable TCP's + behavior is considerably different from that of Standard TCP. The + increase parameter is smaller than that of Standard TCP, and the + decrease is smaller also, 1/8-th instead of 1/2. That is, for 10 + Mbps links, Scalable TCP increases less aggressively than Standard + TCP or HighSpeed TCP, but decreases less aggressively as well. 
+ + In an environment with a 100 Mbps link, Scalable TCP has an increase + parameter of roughly four segments per round-trip time, with the same + decrease parameter of 1/8-th. A comparison of Tables 9 and 10 shows + that for this scenario of 100 Mbps links, HighSpeed TCP increases + more aggressively than Scalable TCP. + + Next we consider the relative fairness between Standard TCP, + HighSpeed TCP and Scalable TCP. The relative fairness between + HighSpeed TCP and Standard TCP was shown in Table 5 earlier in this + document, and the relative fairness between Scalable TCP and Standard + TCP was shown in Table 8. Following the approach in Section 6, for a + given packet drop rate p, for p < 10^-3, we can estimate the relative + fairness between Scalable and HighSpeed TCP as + W_Scalable/W_HighSpeed. This relative fairness is shown in Table 11 + below. The bandwidth in the last column of Table 11 is the aggregate + + + +Floyd Experimental [Page 22] + +RFC 3649 HighSpeed TCP December 2003 + + + bandwidth of the two competing flows given 100 ms round-trip times + and 1500-byte packets. + + Packet Drop Rate P Fairness Aggregate Window Bandwidth + ------------------ -------- ---------------- --------- + 10^-2 1.0 24 2.8 Mbps + 10^-3 1.0 76 9.1 Mbps + 10^-4 1.4 643 77.1 Mbps + 10^-5 2.1 5595 671.4 Mbps + 10^-6 3.1 50279 6.0 Gbps + 10^-7 4.5 463981 55.7 Gbps + + Table 11: Relative Fairness between the Scalable and HighSpeed + Response Functions. + + The second row of Table 11 shows that for a Scalable TCP and a + HighSpeed TCP flow competing in an environment with 100 ms RTTs and a + 10 Mbps pipe, the two flows would receive essentially the same + bandwidth. The next row shows that for a Scalable TCP and a + HighSpeed TCP flow competing in an environment with 100 ms RTTs and a + 100 Mbps pipe, the Scalable TCP flow would receive roughly 50% more + bandwidth than would HighSpeed TCP. Table 11 shows the relative + fairness in higher-bandwidth environments as well. 
This relative + fairness seems sufficient that there should be no problems with + Scalable TCP and HighSpeed TCP coexisting in the same environment as + Experimental variants of TCP. + + We note that one question that requires more investigation with + Scalable TCP is that of convergence to fairness in environments with + Drop-Tail queue management. + +12. Related Work in HighSpeed TCP + + HighSpeed TCP has been separately investigated in simulations by + Sylvia Ratnasamy and by Evandro de Souza [SA03]. The simulations in + [SA03] verify the fairness properties of HighSpeed TCP when sharing a + link with Standard TCP. + + These simulations explore the relative fairness of HighSpeed TCP + flows when competing with Standard TCP. The simulation environment + includes background forward and reverse-path TCP traffic limited by + the TCP receive window, along with a small amount of forward and + reverse-path traffic from the web traffic generator. Most of the + simulations so far explore performance on a simple dumbbell topology + with a 1 Gbps link with a propagation delay of 50 ms. Simulations + have been run with Adaptive RED and with DropTail queue management. + + + + + +Floyd Experimental [Page 23] + +RFC 3649 HighSpeed TCP December 2003 + + + The simulations in [SA03] explore performance with a varying number + of competing flows, with the competing traffic being all standard + TCP; all HighSpeed TCP; or a mix of standard and HighSpeed TCP. For + the simulations in [SA03] with RED queue management, the relative + fairness between standard and HighSpeed TCP is consistent with the + relative fairness predicted in Table 5. For the simulations with + Drop Tail queues, the relative fairness is more skewed, with the + HighSpeed TCP flows receiving an even larger share of the link + bandwidth. 
This is not surprising; with Active Queue Management at + the congested link, the fraction of packet drops received by each + flow should be roughly proportional to that flow's share of the link + bandwidth, while this property no longer holds with Drop Tail queue + management. We also note that relative fairness in simulations with + Drop Tail queue management can sometimes depend on small details of + the simulation scenario, and that Drop Tail simulations need special + care to avoid phase effects [F92]. + + [SA03] explores the bandwidth `stolen' by HighSpeed TCP from standard + TCP by exploring the fraction of the link bandwidth N standard TCP + flows receive when competing against N other standard TCP flows, and + comparing this to the fraction of the link bandwidth the N standard + TCP flows receive when competing against N HighSpeed TCP flows. For + the 1 Gbps simulation scenarios dominated by long-lived traffic, a + small number of standard TCP flows are able to achieve high link + utilization, and the HighSpeed TCP flows can be viewed as stealing + bandwidth from the competing standard TCP flows, as predicted in + Section 6 on the Fairness Implications of the HighSpeed Response + Function. However, [SA03] shows that when even a small fraction of + the link bandwidth is used by more bursty, short TCP connections, the + standard TCP flows are unable to achieve high link utilization, and + the HighSpeed TCP flows in this case are not `stealing' bandwidth + from the standard TCP flows, but instead are using bandwidth that + otherwise would not be utilized. + + The conclusions of [SA03] are that "HighSpeed TCP behaved as forseen + by its response function, and appears to be a real and viable option + for use on high-speed wide area TCP connections." 
+ + Future work that could be explored in more detail includes + convergence times after new flows start-up; recovery time after a + transient outage; the response to sudden severe congestion, and + investigations of the potential for oscillations. We invite + contributions from others in this work. + + + + + + + + +Floyd Experimental [Page 24] + +RFC 3649 HighSpeed TCP December 2003 + + +13. Relationship to other Work + + Our assumption is that HighSpeed TCP will be used with the TCP SACK + option, and also with the increased Initial Window of three or four + segments, as allowed by [RFC3390]. For paths that have substantial + reordering, TCP performance would be greatly improved by some of the + mechanisms still in the research stages for robust performance in the + presence of reordered packets. + + Our view is that HighSpeed TCP is largely orthogonal to proposals for + higher PMTU (Path MTU) values [M02]. Unlike changes to the PMTU, + HighSpeed TCP does not require any changes in the network or at the + TCP receiver, and works well in the current Internet. Our assumption + is that HighSpeed TCP would be useful even with larger values for the + PMTU. Unlike the current congestion window, the PMTU gives no + information about the bandwidth-delay product available to that + particular flow. + + A related approach is that of a virtual MTU, where the actual MTU of + the path might be limited [VMSS,S02]. The virtual MTU approach has + not been fully investigated, and we do not explore the virtual MTU + approach further in this document. + +14. Conclusions + + This document has proposed HighSpeed TCP, a modification to TCP's + congestion control mechanism for use with TCP connections with large + congestion windows. We have explored this proposal in simulations, + and others have explored HighSpeed TCP with experiments, and we + believe HighSpeed TCP to be safe to deploy on the current Internet. 
+ We would welcome additional analysis, simulations, and particularly, + experimentation. More information on simulations and experiments is + available from the HighSpeed TCP Web Page [HSTCP]. There are several + independent implementations of HighSpeed TCP [D02,F03] and of + Scalable TCP [K03] for further investigation. + +15. Acknowledgements + + The HighSpeed TCP proposal is from joint work with Sylvia Ratnasamy + and Scott Shenker (and was initiated by Scott Shenker). Additional + investigations of HighSpeed TCP were joint work with Evandro de Souza + and Deb Agarwal. We thank Tom Dunigan for the implementation in the + Linux 2.4.16 Web100 kernel, and for resulting experimentation with + HighSpeed TCP. We are grateful to the End-to-End Research Group, the + members of the Transport Area Working Group, and to members of the + IPAM program in Large Scale Communication Networks for feedback. We + thank Glenn Vinnicombe for framing the Linear response function in + the parameters of HighSpeed TCP. We are also grateful for + + + +Floyd Experimental [Page 25] + +RFC 3649 HighSpeed TCP December 2003 + + + contributions and feedback from the following individuals: Les + Cottrell, Mitchell Erblich, Jeffrey Hsu, Tom Kelly, Chuck Jackson, + Matt Mathis, Jitendra Padhye, Andrew Reiter, Stanislav Shalunov, Alex + Solan, Paul Sutter, Brian Tierney, Joe Touch. + +16. Normative References + + [RFC2581] Allman, M., Paxson, V. and W. Stevens, "TCP Congestion + Control", RFC 2581, April 1999. + +17. Informative References + + [ABLLS03] A. Antony, J. Blom, C. de Laat, J. Lee, and W. Sjouw, + "Microscopic Examination of TCP Flows over Transatlantic + Links", iGrid2002 special issue, Future Generation + Computer Systems, volume 19 issue 6 (2003), URL + "http://www.science.uva.nl/~delaat/techrep-2003-2- + tcp.pdf". 
+ + [BBFS01] Deepak Bansal, Hari Balakrishnan, Sally Floyd, and Scott + Shenker, "Dynamic Behavior of Slowly-Responsive Congestion + Control Algorithms", SIGCOMM 2001, August 2001. + + [CC03] Fabrizio Coccetti and Les Cottrell, "TCP Stack + Measurements on Lightly Loaded Testbeds", 2003. URL + "http://www-iepm.slac.stanford.edu/monitoring/bulk/fast/". + + [CJ89] D. Chiu and R. Jain, "Analysis of the Increase and + Decrease Algorithms for Congestion Avoidance in Computer + Networks", Computer Networks and ISDN Systems, Vol. 17, + pp. 1-14, 1989. + + [CO98] J. Crowcroft and P. Oechslin, "Differentiated End-to-end + Services using a Weighted Proportional Fair Share TCP", + Computer Communication Review, 28(3):53--69, 1998. + + [D02] Tom Dunigan, "Floyd's TCP slow-start and AIMD mods", URL + "http://www.csm.ornl.gov/~dunigan/net100/floyd.html". + + [F03] Gareth Fairey, "High-Speed TCP", 2003. URL + "http://www.hep.man.ac.uk/u/garethf/hstcp/". + + [F92] S. Floyd and V. Jacobson, "On Traffic Phase Effects in + Packet-Switched Gateways, Internetworking: Research and + Experience", V.3 N.3, September 1992, p.115-156. URL + "http://www.icir.org/floyd/papers.html". + + + + + +Floyd Experimental [Page 26] + +RFC 3649 HighSpeed TCP December 2003 + + + [Fl03] Sally Floyd, "Re: [Tsvwg] taking NewReno (RFC 2582) to + Proposed Standard", Email to the tsvwg mailing list, May + 14, 2003. + + URLs "http://www1.ietf.org/mail-archive/working- + groups/tsvwg/current/msg04086.html" and + "http://www1.ietf.org/mail-archive/working- + groups/tsvwg/current/msg04087.html". + + [FF98] Floyd, S., and Fall, K., "Promoting the Use of End-to-End + Congestion Control in the Internet", IEEE/ACM Transactions + on Networking, August 1999. + + [FRS02] Sally Floyd, Sylvia Ratnasamy, and Scott Shenker, + "Modifying TCP's Congestion Control for High Speeds", May + 2002. URL "http://www.icir.org/floyd/notes.html". 
+ + [GRK99] Panos Gevros, Fulvio Risso and Peter Kirstein, "Analysis + of a Method for Differential TCP Service". In Proceedings + of the IEEE GLOBECOM'99, Symposium on Global Internet , + December 1999, Rio de Janeiro, Brazil. + + [GV02] S. Gorinsky and H. Vin, "Extended Analysis of Binary + Adjustment Algorithms", Technical Report TR2002-39, + Department of Computer Sciences, The University of Texas + at Austin, August 2002. URL + "http://www.cs.utexas.edu/users/gorinsky/pubs.html". + + [HSTCP] HighSpeed TCP Web Page, URL + "http://www.icir.org/floyd/hstcp.html". + + [J02] Amit Jain and Sally Floyd, "Quick-Start for TCP and IP", + Work in Progress, 2002. + + [JWL03] Cheng Jin, David X. Wei and Steven H. Low, "FAST TCP for + High-speed Long-distance Networks", Work in Progress, June + 2003. + + [K03] Tom Kelly, "Scalable TCP: Improving Performance in + HighSpeed Wide Area Networks", February 2003. URL + "http://www-lce.eng.cam.ac.uk/~ctk21/scalable/". + + [KHR02] Dina Katabi, Mark Handley, and Charlie Rohrs, "Congestion + Control for High Bandwidth-Delay Product Networks", + SIGCOMM 2002. + + [M02] Matt Mathis, "Raising the Internet MTU", Web Page, URL + "http://www.psc.edu/~mathis/MTU/". + + + +Floyd Experimental [Page 27] + +RFC 3649 HighSpeed TCP December 2003 + + + [Net100] The DOE/MICS Net100 project. URL + "http://www.csm.ornl.gov/~dunigan/net100/". + + [NS] The NS Simulator, "http://www.isi.edu/nsnam/ns/". + + [RFC 1323] Jacobson, V., Braden, R. and D. Borman, "TCP Extensions + for High Performance", RFC 1323, May 1992. + + [RFC3390] Allman, M., Floyd, S. and C., Partridge, "Increasing TCP's + Initial Window", RFC 3390, October 2002. + + [RFC3448] Handley, M., Padhye, J., Floyd, S. and J. Widmer, "TCP + Friendly Rate Control (TFRC): Protocol Specification", RFC + 3448, January 2003. + + [SA03] Souza, E. and D.A., Agarwal, "A HighSpeed TCP Study: + Characteristics and Deployment Issues", LBNL Technical + Report LBNL-53215. 
URL + "http://www.icir.org/floyd/hstcp.html". + + [S02] Stanislav Shalunov, "TCP Armonk", Work in Progress, 2002, + URL "http://www.internet2.edu/~shalunov/tcpar/". + + [S03] Alex Solan, private communication, 2003. + + [VMSS] "Web100 at ORNL", Web Page, + "http://www.csm.ornl.gov/~dunigan/netperf/web100.html". + + [Web100] The Web100 project. URL "http://www.web100.org/". + +18. Security Considerations + + This proposal makes no changes to the underlying security of TCP. + +19. IANA Considerations + + There are no IANA considerations regarding this document. + + + + + + + + + + + + + + +Floyd Experimental [Page 28] + +RFC 3649 HighSpeed TCP December 2003 + + +A. TCP's Loss Event Rate in Steady-State + + This section gives the number of round-trip times between congestion + events for a TCP flow with D-byte packets, for D=1500, as a function + of the connection's average throughput B in bps. To achieve this + average throughput B, a TCP connection with round-trip time R in + seconds requires an average congestion window w of BR/(8D) segments. + + In steady-state, TCP's average congestion window w is roughly + 1.2/sqrt(p) segments. This is equivalent to a loss event at most + once every 1/p packets, or at most once every 1/(pw) = w/1.5 round- + trip times. Substituting for w, this is a loss event at most every + (BR)/(12D) round-trip times. + + As an example, for R = 0.1 seconds and D = 1500 bytes, this gives + B/180000 round-trip times between loss events. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Floyd Experimental [Page 29] + +RFC 3649 HighSpeed TCP December 2003 + + +B. A table for a(w) and b(w). + + This section gives a table for the increase and decrease parameters + a(w) and b(w) for HighSpeed TCP, for the default values of Low_Window + = 38, High_Window = 83000, High_P = 10^-7, and High_Decrease = 0.1. 
+ + w a(w) b(w) + ---- ---- ---- + 38 1 0.50 + 118 2 0.44 + 221 3 0.41 + 347 4 0.38 + 495 5 0.37 + 663 6 0.35 + 851 7 0.34 + 1058 8 0.33 + 1284 9 0.32 + 1529 10 0.31 + 1793 11 0.30 + 2076 12 0.29 + 2378 13 0.28 + 2699 14 0.28 + 3039 15 0.27 + 3399 16 0.27 + 3778 17 0.26 + 4177 18 0.26 + 4596 19 0.25 + 5036 20 0.25 + 5497 21 0.24 + 5979 22 0.24 + 6483 23 0.23 + 7009 24 0.23 + 7558 25 0.22 + 8130 26 0.22 + 8726 27 0.22 + 9346 28 0.21 + 9991 29 0.21 + 10661 30 0.21 + 11358 31 0.20 + 12082 32 0.20 + 12834 33 0.20 + 13614 34 0.19 + 14424 35 0.19 + 15265 36 0.19 + 16137 37 0.19 + 17042 38 0.18 + 17981 39 0.18 + 18955 40 0.18 + + + +Floyd Experimental [Page 30] + +RFC 3649 HighSpeed TCP December 2003 + + + 19965 41 0.17 + 21013 42 0.17 + 22101 43 0.17 + 23230 44 0.17 + 24402 45 0.16 + 25618 46 0.16 + 26881 47 0.16 + 28193 48 0.16 + 29557 49 0.15 + 30975 50 0.15 + 32450 51 0.15 + 33986 52 0.15 + 35586 53 0.14 + 37253 54 0.14 + 38992 55 0.14 + 40808 56 0.14 + 42707 57 0.13 + 44694 58 0.13 + 46776 59 0.13 + 48961 60 0.13 + 51258 61 0.13 + 53677 62 0.12 + 56230 63 0.12 + 58932 64 0.12 + 61799 65 0.12 + 64851 66 0.11 + 68113 67 0.11 + 71617 68 0.11 + 75401 69 0.10 + 79517 70 0.10 + 84035 71 0.10 + 89053 72 0.10 + 94717 73 0.09 + + Table 12: Parameters for HighSpeed TCP. + + + + + + + + + + + + + + + + +Floyd Experimental [Page 31] + +RFC 3649 HighSpeed TCP December 2003 + + + This table was computed with the following Perl program: + + $top = 100000; + $num = 38; + if ($num == 38) { + print " w a(w) b(w)\n"; + print " ---- ---- ----\n"; + print " 38 1 0.50\n"; + $oldb = 0.50; + $olda = 1; + } + while ($num < $top) { + $bw = (0.1 -0.5)*(log($num)-log(38))/(log(83000)-log(38))+0.5; + $aw = ($num**2*2.0*$bw) / ((2.0-$bw)*$num**1.2*12.8); + if ($aw > $olda + 1) { + printf "%6d %5d %3.2f\n", $num, $aw, $bw; + $olda = $aw; + } + $num ++; + } + + Table 13: Perl Program for computing parameters for HighSpeed TCP. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Floyd Experimental [Page 32] + +RFC 3649 HighSpeed TCP December 2003 + + +C. Exploring the time to converge to fairness. + + This section gives the Perl program used to compute the congestion + window growth during congestion avoidance. + + $top = 2001; + $hswin = 1; + $regwin = 1; + $rtt = 1; + $lastrtt = 0; + $rttstep = 100; + if ($hswin == 1) { + print " RTT HS_Window Standard_TCP_Window\n"; + print " --- --------- -------------------\n"; + } + while ($rtt < $top) { + $bw = (0.1 -0.5)*(log($hswin)-log(38))/(log(83000)-log(38))+0.5; + $aw = ($hswin**2*2.0*$bw) / ((2.0-$bw)*$hswin**1.2*12.8); + if ($aw < 1) { + $aw = 1; + } + if ($rtt >= $lastrtt + $rttstep) { + printf "%5d %9d %10d\n", $rtt, $hswin, $regwin; + $lastrtt = $rtt; + } + $hswin += $aw; + $regwin += 1; + $rtt ++; + } + + Table 14: Perl Program for computing the window in congestion + avoidance. + +Author's Address + + Sally Floyd + ICIR (ICSI Center for Internet Research) + + Phone: +1 (510) 666-2989 + EMail: floyd@acm.org + URL: http://www.icir.org/floyd/ + + + + + + + + + + +Floyd Experimental [Page 33] + +RFC 3649 HighSpeed TCP December 2003 + + +Full Copyright Statement + + Copyright (C) The Internet Society (2003). All Rights Reserved. + + This document and translations of it may be copied and furnished to + others, and derivative works that comment on or otherwise explain it + or assist in its implementation may be prepared, copied, published + and distributed, in whole or in part, without restriction of any + kind, provided that the above copyright notice and this paragraph are + included on all such copies and derivative works. 
However, this + document itself may not be modified in any way, such as by removing + the copyright notice or references to the Internet Society or other + Internet organizations, except as needed for the purpose of + developing Internet standards in which case the procedures for + copyrights defined in the Internet Standards process must be + followed, or as required to translate it into languages other than + English. + + The limited permissions granted above are perpetual and will not be + revoked by the Internet Society or its successors or assignees. + + This document and the information contained herein is provided on an + "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING + TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING + BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION + HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF + MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + +Acknowledgement + + Funding for the RFC Editor function is currently provided by the + Internet Society. + + + + + + + + + + + + + + + + + + + +Floyd Experimental [Page 34] + diff --git a/ext/picotcp/RFC/rfc3819.txt b/ext/picotcp/RFC/rfc3819.txt new file mode 100644 index 0000000..b4a5e8b --- /dev/null +++ b/ext/picotcp/RFC/rfc3819.txt @@ -0,0 +1,3363 @@ + + + + + + +Network Working Group P. Karn, Ed. +Request for Comments: 3819 Qualcomm +BCP: 89 C. Bormann +Category: Best Current Practice Universitaet Bremen TZI + G. Fairhurst + University of Aberdeen + D. Grossman + Motorola, Inc. + R. Ludwig + Ericsson Research + J. Mahdavi + Novell + G. Montenegro + Sun Microsystems Laboratories, Europe + J. Touch + USC/ISI + L. Wood + Cisco Systems + July 2004 + + + Advice for Internet Subnetwork Designers + +Status of this Memo + + This document specifies an Internet Best Current Practices for the + Internet Community, and requests discussion and suggestions for + improvements. Distribution of this memo is unlimited. 
+ +Copyright Notice + + Copyright (C) The Internet Society (2004). + +Abstract + + This document provides advice to the designers of digital + communication equipment, link-layer protocols, and packet-switched + local networks (collectively referred to as subnetworks), who wish to + support the Internet protocols but may be unfamiliar with the + Internet architecture and the implications of their design choices on + the performance and efficiency of the Internet. + + + + + + + + + + +Karn, et al. Best Current Practice [Page 1] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + +Table of Contents + + 1. Introduction and Overview. . . . . . . . . . . . . . . . . . . 2 + 2. Maximum Transmission Units (MTUs) and IP Fragmentation . . . . 4 + 2.1. Choosing the MTU in Slow Networks. . . . . . . . . . . . 6 + 3. Framing on Connection-Oriented Subnetworks . . . . . . . . . . 7 + 4. Connection-Oriented Subnetworks. . . . . . . . . . . . . . . . 9 + 5. Broadcasting and Discovery . . . . . . . . . . . . . . . . . . 10 + 6. Multicasting . . . . . . . . . . . . . . . . . . . . . . . . . 11 + 7. Bandwidth on Demand (BoD) Subnets. . . . . . . . . . . . . . . 13 + 8. Reliability and Error Control. . . . . . . . . . . . . . . . . 14 + 8.1. TCP vs Link-Layer Retransmission . . . . . . . . . . . . 14 + 8.2. Recovery from Subnetwork Outages . . . . . . . . . . . . 17 + 8.3. CRCs, Checksums and Error Detection. . . . . . . . . . . 18 + 8.4. How TCP Works. . . . . . . . . . . . . . . . . . . . . . 20 + 8.5. TCP Performance Characteristics. . . . . . . . . . . . . 22 + 8.5.1. The Formulae . . . . . . . . . . . . . . . . . . 22 + 8.5.2. Assumptions. . . . . . . . . . . . . . . . . . . 23 + 8.5.3. Analysis of Link-Layer Effects on TCP + Performance. . . . . . . . . . . . . . . . . . . 24 + 9. Quality-of-Service (QoS) Considerations. . . . . . . . . . . . 26 + 10. Fairness vs Performance. . . . . . . . . . . . . . . . . . . . 29 + 11. Delay Characteristics. . . . . . . . . . 
. . . . . . . . . . . 30 + 12. Bandwidth Asymmetries. . . . . . . . . . . . . . . . . . . . . 31 + 13. Buffering, Flow and Congestion Control . . . . . . . . . . . . 31 + 14. Compression. . . . . . . . . . . . . . . . . . . . . . . . . . 34 + 15. Packet Reordering. . . . . . . . . . . . . . . . . . . . . . . 36 + 16. Mobility . . . . . . . . . . . . . . . . . . . . . . . . . . . 37 + 17. Routing. . . . . . . . . . . . . . . . . . . . . . . . . . . . 39 + 18. Security Considerations. . . . . . . . . . . . . . . . . . . . 41 + 19. Contributors . . . . . . . . . . . . . . . . . . . . . . . . . 44 + 20. Informative References . . . . . . . . . . . . . . . . . . . . 45 + 21. Contributors' Addresses. . . . . . . . . . . . . . . . . . . . 57 + 22. Authors' Addresses . . . . . . . . . . . . . . . . . . . . . . 58 + 23. Full Copyright Statement . . . . . . . . . . . . . . . . . . . 60 + +1. Introduction and Overview + + IP, the Internet Protocol [RFC791] [RFC2460], is the core protocol of + the Internet. IP defines a simple "connectionless" packet-switched + network. The success of the Internet is largely attributed to IP's + simplicity, the "end-to-end principle" [SRC81] on which the Internet + is based, and the resulting ease of carrying IP on a wide variety of + subnetworks, not necessarily designed with IP in mind. A subnetwork + refers to any network operating immediately below the IP layer to + connect two or more systems using IP (i.e., end hosts or routers). + In its simplest form, this may be a direct connection between the IP + systems (e.g., using a length of cable or a wireless medium). + + + +Karn, et al. Best Current Practice [Page 2] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + This document defines a subnetwork as a layer 2 network, which is a + network that does not rely upon the services of IP routers to forward + packets between parts of the subnetwork. 
However, IP routers may + bridge frames at Layer 2 between parts of a subnetwork. Sometimes, + it is convenient to aggregate a group of such subnetworks into a + single logical subnetwork. IP routing protocols (e.g., OSPF, IS-IS, + and PIM) can be configured to support this aggregation, but typically + present a layer-3 subnetwork rather than a layer-2 subnetwork. This + may also result in a specific packet passing several times over the + same layer-2 subnetwork via an intermediate layer-3 gateway (router). + Because that aggregation requires layer-3 components, issues thereof + are beyond the scope of this document. + + However, while many subnetworks carry IP, they do not necessarily do + so with maximum efficiency, minimum complexity, or cost, nor do they + implement certain features to efficiently support newer Internet + features of increasing importance, such as multicasting or quality of + service. + + With the explosive growth of the Internet, IP packets comprise an + increasingly large fraction of the traffic carried by the world's + telecommunications networks. It therefore makes sense to optimize + both existing and new subnetwork technologies for IP as much as + possible. + + Optimizing a subnetwork for IP involves three complementary + considerations: + + 1. Providing functionality sufficient to carry IP. + + 2. Eliminating unnecessary functions that increase cost or + complexity. + + 3. Choosing subnetwork parameters that maximize the performance of + the Internet protocols. + + Because IP is so simple, consideration 2 is more of an issue than + consideration 1. That is to say, subnetwork designers make many more + errors of commission than errors of omission. However, certain + enhancements to Internet features, such as multicasting and quality- + of-service, benefit significantly from support given by the + underlying subnetworks beyond that necessary to carry "traditional" + unicast, best-effort IP. + + + + + + + + +Karn, et al. 
Best Current Practice [Page 3] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + A major consideration in the efficient design of any layered + communication network is the appropriate layer(s) in which to + implement a given function. This issue was first addressed in the + seminal paper, "End-to-End Arguments in System Design" [SRC81]. That + paper argued that many functions can be implemented properly *only* + on an end-to-end basis, i.e., at the highest protocol layers, outside + the subnetwork. These functions include ensuring the reliable + delivery of data and the use of cryptography to provide + confidentiality and message integrity. + + Such functions cannot be provided solely by the concatenation of + hop-by-hop services; duplicating these functions at the lower + protocol layers (i.e., within the subnetwork) can be needlessly + redundant or even harmful to cost and performance. + + However, partial duplication of functionality in a lower layer can + *sometimes* be justified by performance, security, or availability + considerations. Examples include link-layer retransmission to + improve the performance of an unusually lossy channel, e.g., mobile + radio, link-level encryption intended to thwart traffic analysis, and + redundant transmission links to improve availability, increase + throughput, or to guarantee performance for certain classes of + traffic. Duplication of protocol functions should be done only with + an understanding of system-level implications, including possible + interactions with higher-layer mechanisms. + + The original architecture of the Internet was influenced by the + end-to-end principle [SRC81], and has been, in our view, part of the + reason for the Internet's success. + + The remainder of this document discusses the various subnetwork + design issues that the authors consider relevant to efficient IP + support. + +2. 
Maximum Transmission Units (MTUs) and IP Fragmentation + + IPv4 packets (datagrams) vary in size, from 20 bytes (the size of the + IPv4 header alone) to a maximum of 65535 bytes. Subnetworks need not + support maximum-sized (64KB) IP packets, as IP provides a scheme that + breaks packets that are too large for a given subnetwork into + fragments that travel as independent IP packets and are reassembled + at the destination. The maximum packet size supported by a + subnetwork is known as its Maximum Transmission Unit (MTU). + + Subnetworks may, but are not required to, indicate the length of each + packet they carry. One example is Ethernet with the widely used DIX + [DIX82] (not IEEE 802.3 [IEEE8023]) header, which lacks a length + + + + +Karn, et al. Best Current Practice [Page 4] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + field to indicate the true data length when the packet is padded to a + minimum of 60 bytes. This is not a problem for uncompressed IP + because each IP packet carries its own length field. + + If optional header compression [RFC1144] [RFC2507] [RFC2508] + [RFC3095] is used, however, it is required that the link framing + indicate frame length because that is needed for the reconstruction + of the original header. + + In IP version 4 (the version now in widespread use), fragmentation + can occur at either the sending host or in an intermediate router, + and fragments can be further fragmented at subsequent routers if + necessary. + + In IP version 6 [RFC2460], fragmentation can occur only at the + sending host; it cannot occur in a router (called "router + fragmentation" in this document). + + Both IPv4 and IPv6 provide a "path MTU discovery" procedure [RFC1191] + [RFC1435] [RFC1981] that allows the sending host to avoid + fragmentation by discovering the minimum MTU along a given path and + reduce its packet sizes accordingly. This procedure is optional in + IPv4 and IPv6. 
+ + Path MTU discovery is widely deployed, but it sometimes encounters + problems. Some routers fail to generate the ICMP messages that + convey path MTU information to the sender, and sometimes the ICMP + messages are blocked by overly restrictive firewalls. The result can + be a "Path MTU Black Hole" [RFC2923] [RFC1435]. + + The Path MTU Discovery procedure, the persistence of path MTU black + holes, and the deletion of router fragmentation in IPv6 reflect a + consensus of the Internet technical community that router + fragmentation is best avoided. This requires that subnetworks + support MTUs that are "reasonably" large. All IPv4 end hosts are + required to accept and reassemble IP packets of size 576 bytes + [RFC791], but such a small value would clearly be inefficient. + Because IPv6 omits fragmentation by routers, [RFC2460] specifies a + larger minimum MTU of 1280 bytes. Any subnetwork with an internal + packet payload smaller than 1280 bytes must implement a mechanism + that performs fragmentation/reassembly of IP packets to/from + subnetwork frames if it is to support IPv6. + + If a subnetwork cannot directly support a "reasonable" MTU with + native framing mechanisms, it should internally fragment. That is, + it should transparently break IP packets into internal data elements + and reassemble them at the other end of the subnetwork. + + + + +Karn, et al. Best Current Practice [Page 5] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + This leaves the question of what is a "reasonable" MTU. Ethernet (10 + and 100 Mb/s) has an MTU of 1500 bytes, and because of the ubiquity + of Ethernet few Internet paths currently have MTUs larger than this + value. This severely limits the utility of larger MTUs provided by + other subnetworks. 
Meanwhile, larger MTUs are increasingly desirable + on high-speed subnetworks to reduce the per-packet processing + overhead in host computers, and implementers are encouraged to + provide them even though they may not be usable when Ethernet is also + in the path. + + Various "tunneling" schemes, such as GRE [RFC2784] or IP Security in + tunnel mode [RFC2406], treat IP as a subnetwork for IP. Since + tunneling adds header overhead, it can trigger fragmentation, even + when the same physical subnetworks (e.g., Ethernet) are used on both + sides of the host performing IPsec encapsulation. Tunneling has made + it more difficult to avoid router fragmentation and has increased the + incidence of path MTU black holes [RFC2401] [RFC2923]. Larger + subnetwork MTUs may help to alleviate this problem. + +2.1. Choosing the MTU in Slow Networks + + In slow networks, the largest possible packet may take a considerable + amount of time to send. This is known as channelisation or + serialisation delay. Total end-to-end interactive response time + should not exceed the well-known human factors limit of 100 to 200 + ms. This includes all sources of delay: electromagnetic propagation + delay, queuing delay, serialisation delay, and the store-and-forward + time, i.e., the time to transmit a packet at link speed. + + At low link speeds, store-and-forward delays can dominate total + end-to-end delay; these are in turn directly influenced by the + maximum transmission unit (MTU) size. Even when an interactive + packet is given a higher queuing priority, it may have to wait for a + large bulk transfer packet to finish transmission. This worst-case + wait can be set by an appropriate choice of MTU. + + For example, if the MTU is set to 1500 bytes, then an MTU-sized + packet will take about 8 milliseconds to send on a T1 (1.536 Mb/s) + link. But if the link speed is 19.2kb/s, then the transmission time + becomes 625 ms -- well above our 100-200ms limit. 
A 256-byte MTU + would lower this delay to a little over 100 ms. However, care should + be taken not to lower the MTU excessively, as this will increase + header overhead and trigger frequent router fragmentation (if Path + MTU discovery is not in use). This is likely to be the case with + multicast, where Path MTU discovery is ineffective. + + One way to limit delay for interactive traffic without imposing a + small MTU is to give priority to this traffic and to preempt (abort) + + + +Karn, et al. Best Current Practice [Page 6] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + the transmission of a lower-priority packet when a higher priority + packet arrives in the queue. However, the link resources used to + send the aborted packet are lost, and overall throughput will + decrease. + + Another way to limit delay is to implement a link-level multiplexing + scheme that allows several packets to be in progress simultaneously, + with transmission priority given to segments of higher-priority IP + packets. For links using the Point-To-Point Protocol (PPP) + [RFC1661], multi-class multilink [RFC2686] [RFC2687] [RFC2689] + provides such a facility. + + ATM (asynchronous transfer mode), where SNDUs are fragmented and + interleaved across smaller 53-byte ATM cells, is another example of + this technique. However, ATM is generally used on high-speed links + where the store-and-forward delays are already minimal, and it + introduces significant (~9%) increases in overhead due to the + addition of a 5-byte cell header to each 48-byte ATM cell payload. + + A third example is the Data-Over-Cable Service Interface + Specification (DOCSIS) with typical upstream bandwidths of 2.56 Mb/s + or 5.12 Mb/s. To reduce the impact of a 1500-byte MTU in DOCSIS 1.0 + [DOCSIS1], a data link layer fragmentation mechanism is specified in + DOCSIS 1.1 [DOCSIS2].
To accommodate the installed base, DOCSIS 1.1 + must be backward compatible with DOCSIS 1.0 cable modems, which + generally do not support fragmentation. Under the co-existence of + DOCSIS 1.0 and DOCSIS 1.1, the unfragmented large data packets from + DOCSIS 1.0 cable modems may affect the quality of service for voice + packets from DOCSIS 1.1 cable modems. In this case, it has been + shown in [DOCSIS3] that the use of bandwidth allocation algorithms + can mitigate this effect. + + To summarize, there is a fundamental tradeoff between efficiency and + latency in the design of a subnetwork, and the designer should keep + this tradeoff in mind. + +3. Framing on Connection-Oriented Subnetworks + + IP requires that subnetworks mark the beginning and end of each + variable-length, asynchronous IP packet. Some examples of links and + subnetworks that do not provide this as an intrinsic feature include: + + 1. leased lines carrying a synchronous bit stream; + + 2. ISDN B-channels carrying a synchronous octet stream; + + 3. dialup telephone modems carrying an asynchronous octet stream; + + + + +Karn, et al. Best Current Practice [Page 7] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + and + + 4. Asynchronous Transfer Mode (ATM) networks carrying an + asynchronous stream of fixed-sized "cells". + + The Internet community has defined packet framing methods for all + these subnetworks. The Point-To-Point Protocol (PPP) [RFC1661], + which uses a variant of HDLC, is applicable to bit synchronous, + octet-synchronous, and octet asynchronous links (i.e., examples 1-3 + above). PPP is one preferred framing method for IP, since a large + number of systems interoperate with PPP. ATM has its own framing + methods, described in [RFC2684] [RFC2364]. + + At high speeds, a subnetwork should provide a framed interface + capable of carrying asynchronous, variable-length IP datagrams. 
The + maximum packet size supported by this interface is discussed above in + the MTU/Fragmentation section. The subnetwork may implement this + facility in any convenient manner. + + IP packet boundaries need not coincide with any framing or + synchronization mechanisms internal to the subnetwork. When the + subnetwork implements variable sized data units, the most + straightforward approach is to place exactly one IP packet into each + subnetwork data unit (SNDU), and to rely on the subnetwork's existing + ability to delimit SNDUs to also delimit IP packets. A good example + is Ethernet. However, some subnetworks have SNDUs of one or more + fixed sizes, as dictated by switching, forward error correction + and/or interleaving considerations. Examples of such subnetworks + include ATM, with a single cell payload size of 48 octets plus a 5- + octet header, and IS-95 digital cellular, with two "rate sets" of + four fixed frame sizes each that may be selected on 20 millisecond + boundaries. + + Because IP packets are of variable length, they may not necessarily + fit into an integer multiple of fixed-sized SNDUs. An "adaptation + layer" is needed to convert IP packets into SNDUs while marking the + boundary between each IP packet in some manner. + + There are several approaches to this problem. The first is to encode + each IP packet into one or more SNDUs with no SNDU containing pieces + of more than one IP packet, and to pad out the last SNDU of the + packet as needed. Bits in a control header added to each SNDU + indicate where the data segment belongs in the IP packet. If the + subnetwork provides in-order, at-most-once delivery, the header can + be as simple as a pair of bits indicating whether the SNDU is the + first and/or the last in the IP packet. Alternatively, for + subnetworks that do not reorder the fragments of an SNDU, only the + last SNDU of the packet could be marked, as this would implicitly + + + +Karn, et al. 
Best Current Practice [Page 8] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + indicate the next SNDU as the first in a new IP packet. The AAL5 + (ATM Adaptation Layer 5) scheme used with ATM is an example of this + approach, though it adds other features, including a payload length + field and a payload CRC. + + In AAL5, the ATM User-User Indication, which is encoded in the + Payload Type field of an ATM cell, indicates the last cell of a + packet. The packet trailer is located at the end of the SNDU and + contains the packet length and a CRC. + + Another framing technique is to insert per-segment overhead to + indicate the presence of a segment option. When present, the option + carries a pointer to the end of the packet. This differs from AAL5 + in that it permits another packet to follow within the same segment. + MPEG-2 Transport Streams [EN301192] [ISO13818] support this style of + fragmentation, and may either use padding (limiting each MPEG + transport stream packet to carry only part of one IP packet), or + allow a second IP packet to start in the same Transport Stream packet + (no padding). + + A third approach is to insert a special flag sequence into the data + stream between each IP packet, and to pack the resulting data stream + into SNDUs without regard to SNDU boundaries. This may have + implications when frames are lost. The flag sequence can also pad + unused space at the end of an SNDU. If the special flag appears in + the user data, it is escaped to an alternate sequence (usually larger + than a flag) to avoid being misinterpreted as a flag. The HDLC-based + framing schemes used in PPP are all examples of this approach. + + All three adaptation schemes introduce overhead; how much depends on + the distribution of IP packet sizes, the size(s) of the SNDUs, and in + the HDLC-like approaches, the content of the IP packet (since flag- + like sequences occurring in the packet must be escaped, which expands + them). 
The designer must also weigh implementation complexity and + performance in the choice and design of an adaptation layer. + +4. Connection-Oriented Subnetworks + + IP has no notion of a "connection"; it is a purely connectionless + protocol. When a connection is required by an application, it is + usually provided by TCP [RFC793], the Transmission Control Protocol, + running atop IP on an end-to-end basis. + + Connection-oriented subnetworks can be (and are widely) used to carry + IP, but often with considerable complexity. Subnetworks consisting + of few nodes can simply open a permanent connection between each pair + of nodes. This is frequently done with ATM. However, the number of + connections increases as the square of the number of nodes, so this + + + +Karn, et al. Best Current Practice [Page 9] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + is clearly impractical for large subnetworks. A "shim" layer between + IP and the subnetwork is therefore required to manage connections. + This is one of the most common functions of a Subnetwork Dependent + Convergence Function (SNDCF) sublayer between IP and a subnetwork. + + SNDCFs typically open subnetwork connections as needed when an IP + packet is queued for transmission and close them after an idle + timeout. There is no relation between subnetwork connections and any + connections that may exist at higher layers (e.g., TCP). + + Because Internet traffic is typically bursty and transaction- + oriented, it is often difficult to pick an optimal idle timeout. If + the timeout is too short, subnetwork connections are opened and + closed rapidly, possibly over-stressing the subnetwork connection + management system (especially if it was designed for voice traffic + call holding times). If the timeout is too long, subnetwork + connections are idle much of the time, wasting any resources + dedicated to them by the subnetwork. 
+ + Purely connectionless subnets (such as Ethernet), which have no state + and dynamically share resources, are optimal for supporting best- + effort IP, which is stateless and dynamically shares resources. + Connection-oriented packet networks (such as ATM and Frame Relay), + which have state and dynamically share resources, are less optimal, + since best-effort IP does not benefit from the overhead of creating + and maintaining state. Connection-oriented circuit-switched networks + (including the PSTN and ISDN) have state and statically allocate + resources for a call, and thus require state creation and maintenance + overhead, but do not benefit from the efficiencies of statistical + multiplexing sharing of capacity inherent in IP. + + In any event, if an SNDCF that opens and closes subnet connections is + used to support IP, care should be taken to make sure that connection + processing in the subnet can keep up with relatively short holding + times. + +5. Broadcasting and Discovery + + Subnetworks fall into two categories: point-to-point and shared. A + point-to-point subnet has exactly two endpoint components (hosts or + routers); a shared link has more than two endpoint components, using + either an inherently broadcast medium (e.g., Ethernet, radio) or a + switching layer hidden from the network layer (e.g., switched + Ethernet, Myrinet [MYR95], ATM). Switched subnetworks handle + broadcast by copying broadcast packets, providing each interface that + supports one, or more, systems (hosts or routers) with a copy of each + packet. + + + + +Karn, et al. Best Current Practice [Page 10] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + Several Internet protocols for IPv4 make use of broadcast + capabilities, including link-layer address lookup (ARP), auto- + configuration (RARP, BOOTP, DHCP), and routing (RIP). + + A lack of broadcast capability can impede the performance of these + protocols, or render them inoperable (e.g., DHCP). 
ARP-like link + address lookup can be provided by a centralized database, but at the + expense of potentially higher response latency and the need for nodes + to have explicit knowledge of the ARP server address. Shared links + should support native, link-layer subnet broadcast. + + A corresponding set of IPv6 protocols uses multicasting (see next + section) instead of broadcasting to provide similar functions with + improved scaling in large networks. + +6. Multicasting + + The Internet model includes "multicasting", where IP packets are sent + to all the members of a multicast group [RFC1112] [RFC3376] + [RFC2710]. Multicast is an option in IPv4, but a standard feature of + IPv6. IPv4 multicast is currently used by multimedia, + teleconferencing, gaming, and file distribution (web, peer-to-peer + sharing) applications, as well as by some key network and host + protocols (e.g., RIPv2, OSPF, NTP). IPv6 additionally relies on + multicast for network configuration (DHCP-like autoconfiguration) and + link-layer address discovery [RFC2461] (replacing ARP). In the case + of IPv6, this can allow autoconfiguration and address discovery to + span across routers, whereas the IPv4 broadcast-based services cannot + without ad-hoc router support [RFC1812]. + + Multicast-enabled IP routers organize each multicast group into a + spanning tree, and route multicast packets by making copies of each + multicast packet and forwarding the copies to each output interface + that includes at least one downstream member of the multicast group. + + Multicasting is considerably more efficient when a subnetwork + explicitly supports it. For example, a router relaying a multicast + packet onto an Ethernet segment need send only one copy of the + packet, no matter how many members of the multicast group are + connected to the segment. 
Without native multicast support, routers + and switches on shared links would need to use broadcast with + software filters, such that every multicast packet sent incurs + software overhead for every node on the subnetwork, even if a node is + not a member of the multicast group. Alternatively, the router would + transmit a separate copy to every member of the multicast group on + the segment, as is done on multicast-incapable switched subnets. + + + + + +Karn, et al. Best Current Practice [Page 11] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + Subnetworks using shared channels (e.g., radio LANs, Ethernets) are + especially suitable for native multicasting, and their designers + should make every effort to support it. This involves designating a + section of the subnetwork's own address space for multicasting. On + these networks, multicast is basically broadcast on the medium, with + Layer-2 receiver filters. + + Subnet interfaces also need to be designed to accept packets + addressed to some number of multicast addresses, in addition to the + unicast packets specifically addressed to them. The number of + multicast addresses that needs to be supported by a host depends on + the requirements of the associated host; at least several dozen will + meet most current needs. + + On low-speed networks, the multicast address recognition function may + be readily implemented in host software, but on high-speed networks, + it should be implemented in subnetwork hardware. This hardware need + not be complete; for example, many Ethernet interfaces implement a + "hashing" function where the IP layer receives all of the multicast + (and unicast) traffic to which the associated host subscribes, plus + some small fraction of multicast traffic to which the host does not + subscribe. Host/router software then has to discard the unwanted + packets that pass the Layer-2 multicast address filter [RFC1112].
+ + There does not need to be a one-to-one mapping between a Layer-2 + multicast address and an IP multicast address. An address overlap + may significantly degrade the filtering capability of a receiver's + hardware multicast address filter. A subnetwork supporting only + broadcast should use this service for multicast and must rely on + software filtering. + + Switched subnetworks must also provide a mechanism for copying + multicast packets to ensure the packets reach at least all members of + a multicast group. One option is to "flood" multicast packets in the + same manner as broadcast. This can lead to unnecessary transmissions + on some subnetwork links (notably non-multicast-aware Ethernet + switches). Some subnetworks therefore allow multicast filter tables + to control which links receive packets belonging to a specific group. + To configure this automatically requires access to Layer-3 group + membership information (e.g., IGMP [RFC3376], or MLD [RFC2710]). + Various implementation options currently exist to provide a subnet + node with a list of mappings of multicast addresses to + ports/interfaces. These employ a range of approaches, including + signaling from end hosts (e.g., IEEE 802 GARP/GMRP [802.1p]), + signaling from switches (e.g., CGMP [CGMP] and RGMP [RFC3488]), + interception and proxy of IP group membership packets (e.g., IGMP/MLD + Proxy [MAGMA-PROXY]), and enabling Layer-2 devices to + snoop/inspect/peek into forwarded Layer-3 protocol headers (e.g., + + + +Karn, et al. Best Current Practice [Page 12] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + IGMP, MLD, PIM) so that they may infer Layer-3 multicast group + membership [MAGMA-SNOOP]. These approaches differ in their + complexity, flexibility, and ability to support new protocols. + +7. Bandwidth on Demand (BoD) Subnets + + Some subnets allow a number of subnet nodes to share a channel + efficiently by assigning transmission opportunities dynamically. 
+ Transmission opportunities are requested by a subnet node when it has + packets to send. The subnet schedules and grants transmission + opportunities sufficient to allow the transmitting subnet node to + send one or more packets (or packet fragments). We call these + subnets Bandwidth on Demand (BoD) subnets. Examples of BoD subnets + include Demand Assignment Multiple Access (DAMA) satellite and + terrestrial wireless networks, IEEE 802.11 point coordination + function (PCF) mode, and DOCSIS. A connection-oriented network (such + as the PSTN, ATM or Frame Relay) reserves resources on a much longer + timescale, and is therefore not a BoD subnet in our taxonomy. + + The design parameters for BoD are similar to those in connection- + oriented subnetworks, although the implementations may vary + significantly. In BoD, the user typically requests access to the + shared channel for some duration. Access may be allocated for a + period of time at a specific rate, for a certain number of packets, + or until the user releases the channel. Access may be coordinated + through a central management entity or with a distributed algorithm + amongst the users. Examples of the resource that may be shared + include a terrestrial wireless hop, an upstream channel in a cable + television system, a satellite uplink, and an end-to-end satellite + channel. + + Long-delay BoD subnets pose problems similar to connection-oriented + subnets in anticipating traffic. While connection-oriented subnets + hold idle channels open expecting new data to arrive, BoD subnets + request channel access based on buffer occupancy (or expected buffer + occupancy) on the sending port. Poor performance will likely result + if the sender does not anticipate additional traffic arriving at that + port during the time it takes to grant a transmission request. 
It is + recommended that the algorithm have the capability to extend a hold + on the channel for data that has arrived after the original request + was generated (this may be done by piggybacking new requests on user + data). + + There is a wide variety of BoD protocols available. However, there + has been relatively little comprehensive research on the interactions + between BoD mechanisms and Internet protocol performance. Research + on some specific mechanisms is available (e.g., [AR02]). One item + that has been studied is TCP's retransmission timer [KY02]. BoD + + + +Karn, et al. Best Current Practice [Page 13] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + systems can cause spurious timeouts when adjusting from a relatively + high data rate, to a relatively low data rate. In this case, TCP's + transmitted data takes longer to get through the network than + predicted by the TCP sender's computed retransmission timeout. + Therefore, the TCP sender is prone to resending a segment + prematurely. + +8. Reliability and Error Control + + In the Internet architecture, the ultimate responsibility for error + recovery is at the end points [SRC81]. The Internet may occasionally + drop, corrupt, duplicate, or reorder packets, and the transport + protocol (e.g., TCP) or application (e.g., if UDP is used as the + transport protocol) must recover from these errors on an end-to-end + basis [RFC3155]. Error recovery in the subnetwork is therefore + justifiable only to the extent that it can enhance overall + performance. It is important to recognize that a subnetwork can go + too far in attempting to provide error recovery services in the + Internet environment. Subnet reliability should be "lightweight", + i.e., it only has to be "good enough", *not* perfect. + + In this section, we discuss how to analyze characteristics of a + subnetwork to determine what is "good enough". 
The discussion below + focuses on TCP, which is the most widely-used transport protocol in + the Internet. It is widely believed (and is a stated goal within the + IETF) that non-TCP transport protocols should attempt to be "TCP- + friendly" and have many of the same performance characteristics. + Thus, the discussion below should be applicable, even to portions of + the Internet where TCP may not be the predominant protocol. + +8.1. TCP vs Link-Layer Retransmission + + Error recovery involves the generation and transmission of redundant + information computed from user data. Depending on how much redundant + information is sent and how it is generated, the receiver can use it + to reliably detect transmission errors, correct up to some maximum + number of transmission errors, or both. The general approach is + known as Error Control Coding, or ECC. + + The use of ECC to detect transmission errors so that retransmissions + (hopefully without errors) can be requested is widely known as "ARQ" + (Automatic Repeat Request). + + When enough ECC information is available to permit the receiver to + correct some transmission errors without a retransmission, the + approach is known as Forward Error Correction (FEC). Due to the + greater complexity of the required ECC and the need to tailor its + design to the characteristics of a specific modem and channel, FEC + + + +Karn, et al. Best Current Practice [Page 14] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + has traditionally been implemented in special-purpose hardware + integral to a modem. This effectively makes it part of the physical + layer. + + Unlike ARQ, FEC was rarely used for telecommunications outside of + space links prior to the 1990s. It is now nearly universal in + telephone, cable and DSL modems, digital satellite links, and digital + mobile telephones. FEC is also heavily used in optical and magnetic + storage where "retransmissions" are not possible. 
+ + Some systems use hybrid combinations of ARQ layered atop FEC; V.90 + dialup modems (in the upstream direction) with V.42 error control are + one example. Most errors are corrected by the trellis (FEC) code + within the V.90 modem, and most remaining errors are detected and + corrected by the ARQ mechanisms in V.42. + + Work is now underway to apply FEC above the physical layer, primarily + in connection with reliable multicasting [RFC3048] [RFC3450-RFC3453] + where conventional ARQ mechanisms are inefficient or difficult to + implement. However, in this discussion, we will assume that if FEC + is present, it is implemented within the physical layer. + + Depending on the layer in which it is implemented, error control can + operate on an end-to-end basis or over a shorter span, such as a + single link. TCP is the most important example of an end-to-end + protocol that uses an ARQ strategy. + + Many link-layer protocols use ARQ, usually some flavor of HDLC + [ISO3309]. Examples include the X.25 link layer, the AX.25 protocol + used in amateur packet radio, 802.11 wireless LANs, and the reliable + link layer specified in IEEE 802.2. + + Only end-to-end error recovery can ensure reliable service to the + application (see Section 8). However, some subnetworks (e.g., many + wireless links) also have link-layer error recovery as a performance + enhancement [RFC3366]. For example, many cellular links have small + physical frame sizes (< 100 bytes) and relatively high frame loss + rates. Relying solely on end-to-end error recovery can clearly yield + a performance degradation, as retransmissions across the end-to-end + path take much longer to be received than when link layer + retransmissions are used. Thus, link-layer error recovery can often + increase end-to-end performance. As a result, link-layer and end- + to-end recovery often co-exist; this can lead to the possibility of + inefficient interactions between the two layers of ARQ protocols. 
+ + This inter-layer "competition" might lead to the following wasteful + situation. When the link layer retransmits (parts of) a packet, the + link latency momentarily increases. Since TCP bases its + + + +Karn, et al. Best Current Practice [Page 15] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + retransmission timeout on prior measurements of total end-to-end + latency, including that of the link in question, this sudden increase + in latency may trigger an unnecessary retransmission by TCP of a + packet that the link layer is still retransmitting. Such spurious + end-to-end retransmissions generate unnecessary load and reduce end- + to-end throughput. As a result, the link layer may even have + multiple copies of the same packet in the same link queue at the same + time. In general, one could say the competing error recovery is + caused by an inner control loop (link-layer error recovery) reacting + to the same signal as an outer control loop (end-to-end error + recovery) without any coordination between the loops. Note that this + is solely an efficiency issue; TCP continues to provide reliable + end-to-end delivery over such links. + + This raises the question of how persistent a link-layer sender should + be in performing retransmission [RFC3366]. We define the link-layer + (LL) ARQ persistency as the maximum time that a particular link will + spend trying to transfer a packet before it can be discarded. This + deliberately simplified definition says nothing about the maximum + number of retransmissions, retransmission strategies, queue sizes, + queuing disciplines, transmission delays, or the like. The reason we + use the term LL ARQ persistency, instead of a term such as "maximum + link-layer packet holding time," is that the definition closely + relates to link-layer error recovery. 
For example, on links that + implement straightforward error recovery strategies, LL ARQ + persistency will often correspond to a maximum number of + retransmissions permitted per link-layer frame. + + For link layers that do not or cannot differentiate between flows + (e.g., due to network layer encryption), the LL ARQ persistency + should be small. This avoids any harmful effects or performance + degradation resulting from indiscriminate high persistence. A + detailed discussion of these issues is provided in [RFC3366]. + + However, when a link layer can identify individual flows and apply + ARQ selectively [LKJK02], then the link ARQ persistency should be + high for a flow using reliable unicast transport protocols (e.g., + TCP) and must be low for all other flows. Setting the link ARQ + persistency larger than the largest link outage allows TCP to rapidly + restore transmission without needing to wait for a retransmission + timeout. This generally improves TCP performance in the face of + transient outages. However, excessively high persistence may be + disadvantageous; a practical upper limit of 30-60 seconds may be + desirable. Implementation of such schemes remains a research issue. + (See also the following section "Recovery from Subnetwork Outages"). + + + + + + +Karn, et al. Best Current Practice [Page 16] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + Many subnetwork designers have opportunities to reduce the + probability of packet loss, e.g., with FEC, ARQ, and interleaving, at + the cost of increased delay. TCP performance improves with + decreasing loss but worsens with increasing end-to-end delay, so it + is important to find the proper balance through analysis and + simulation. + +8.2. Recovery from Subnetwork Outages + + Some types of subnetworks, particularly mobile radio, are subject to + frequent temporary outages.
For example, an active cellular data + user may drive or walk into an area (such as a tunnel) that is out of + range of any base station. No packets will be delivered successfully + until the user returns to an area with coverage. + + The Internet protocols currently provide no standard way for a + subnetwork to explicitly notify an upper layer protocol (e.g., TCP) + that it is experiencing an outage rather than severe congestion. + + Under these circumstances TCP will, after each unsuccessful + retransmission, wait even longer before trying again; this is its + "exponential back-off" algorithm. Furthermore, TCP will not discover + that the subnetwork outage has ended until its next retransmission + attempt. If TCP has backed off, this may take some time. This can + lead to extremely poor TCP performance over such subnetworks. + + It is therefore highly desirable that a subnetwork subject to outages + does not silently discard packets during an outage. Ideally, the + subnetwork should define an interface to the next higher layer (i.e., + IP) that allows it to refuse packets during an outage, and to + automatically ask IP for new packets when it is again able to deliver + them. If it cannot do this, then the subnetwork should hold onto at + least some of the packets it accepts during an outage and attempt to + deliver them when the outage ends. When packets are discarded, IP + should be notified so that the appropriate ICMP messages can be sent. + + Note that it is *not* necessary to completely avoid dropping packets + during an outage. The purpose of holding onto a packet during an + outage, either in the subnetwork or at the IP layer, is so that its + eventual delivery will implicitly notify TCP that the subnetwork is + again operational. This is to enhance performance, not to ensure + reliability -- reliability, as discussed earlier, can only be ensured + on an end-to-end basis. 
+ + Only a few packets per TCP connection, including ACKs, need be held + in this way to cause the TCP sender to recover from the additional + losses once the flow resumes [RFC3366]. + + + + +Karn, et al. Best Current Practice [Page 17] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + Because it would be a layering violation (and possibly a performance + hit) for IP or a subnetwork layer to look at TCP headers (which would + in any event be impossible if IPsec encryption [RFC2401] is in use), + it would be reasonable for the IP or subnetwork layers to choose, as + a design parameter, some small number of packets that will be + retained during an outage. + +8.3. CRCs, Checksums and Error Detection + + The TCP [RFC793], UDP [RFC768], ICMP, and IPv4 [RFC791] protocols all + use the same simple 16-bit 1's complement checksum algorithm + [RFC1071] to detect corrupted packets. The IPv4 header checksum + protects only the IPv4 header, while the TCP, ICMP, and UDP checksums + provide end-to-end error detection for both the transport pseudo + header (including network and transport layer information) and the + transport payload data. Protection of the data is optional for + applications using UDP [RFC768] for IPv4, but is required for IPv6. + + The Internet checksum is not very strong from a coding theory + standpoint, but it is easy to compute in software, and various + proposals to replace the Internet checksums with stronger checksums + have failed. However, it is known that undetected errors can and do + occur in packets received by end hosts [SP2000]. + + To reduce processing costs, IPv6 has no IP header checksum. The + destination host detects "important" errors in the IP header, such as + the delivery of the packet to the wrong destination. This is done by + including the IP source and destination addresses (pseudo header) in + the computation of the checksum in the TCP or UDP header, a practice + already performed in IPv4. 
Errors in other IPv6 header fields may go + undetected within the network; this was considered a reasonable price + to pay for a considerable reduction in the processing required by + each router, and it was assumed that subnetworks would use a strong + link CRC. + + One way to provide additional protection for an IPv4 or IPv6 header + is by the authentication and packet integrity services of the IP + Security (IPsec) protocol [RFC2401]. However, this may not be a + choice available to the subnetwork designer. + + Most subnetworks implement error detection just above the physical + layer. Packets corrupted in transmission are detected and discarded + before delivery to the IP layer. A 16-bit cyclic redundancy check + (CRC) is usually the minimum for error detection. This is + significantly more robust against most patterns of errors than the + 16-bit Internet checksum. Note that the error detection properties + of a specific CRC code diminish with increasing frame size. The + Point-to-Point Protocol [RFC1662] requires support of a 16-bit CRC + + + +Karn, et al. Best Current Practice [Page 18] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + for each link frame, with a 32-bit CRC as an option. (PPP is often + used in conjunction with a dialup modem, which provides its own error + control). Other subnetworks, including 802.3/Ethernet, AAL5/ATM, + FDDI, Token Ring, and PPP over SONET/SDH all use a 32-bit CRC. Many + subnetworks can also use other mechanisms to enhance the error + detection capability of the link CRC (e.g., FEC in dialup modems, + mobile radio and satellite channels). + + Any new subnetwork designed to carry IP should therefore provide + error detection for each IP packet that is at least as strong as the + 32-bit CRC specified in [ISO3309]. 
While this will achieve a very + low undetected packet error rate due to transmission errors, it will + not (and need not) achieve a very low packet loss rate as the + Internet protocols are better suited to dealing with lost packets + than to dealing with corrupted packets [SRC81]. + + Packet corruption may be, and is, also caused by bugs in host and + router hardware and software. Even if every subnetwork implemented + strong error detection, it is still essential that end-to-end + checksums are used at the receiving end host [SP2000]. + + Designers of complex subnetworks consisting of internal links and + packet switches should consider implementing error detection on an + edge-to-edge basis to cover an entire SNDU (or IP packet). A CRC + would be generated at the entry point to the subnetwork and checked + at the exit endpoint. This may be used instead of, or in combination + with, error detection at the interface to each physical link. An + edge-to-edge check has the significant advantage of protecting + against errors introduced anywhere within the subnetwork, not just + within its transmission links. Examples of this approach include the + way in which the Ethernet CRC-32 is handled by LAN bridges [802.1D]. + ATM AAL5 [ITU-I363] also uses an edge-to-edge CRC-32. + + Some specific applications may be tolerant of residual errors in the + data they exchange, but removal of the link CRC may expose the + network to an undesirable increase in undetected errors in the IP and + transport headers. Applications may also require a high level of + error protection for control information exchanged by protocols + acting above the transport layer. One example is a voice codec, + which is robust against bit errors in the speech samples. For such + mechanisms to work, the receiving application must be able to + tolerate receiving corrupted data. 
This also requires that an + application uses a mechanism to signal that payload corruption is + permitted and to indicate the coverage (headers and data) required to + be protected by the subnetwork CRC. The UDP-Lite protocol [RFC3828] + is the first Internet standards track transport protocol supporting + partial payload protection. Receipt of corrupt data by arbitrary + + + + +Karn, et al. Best Current Practice [Page 19] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + application protocols carries a serious danger that a subnet delivers + data with errors that remain undetected by the application and hence + corrupt the communicated data [SRC81]. + +8.4. How TCP Works + + One of TCP's functions is end-host based congestion control for the + Internet. This is a critical part of the overall stability of the + Internet, so it is important that link-layer designers understand + TCP's congestion control algorithms. + + TCP assumes that, at the most abstract level, the network consists of + links and queues. Queues provide output-buffering on links that are + momentarily oversubscribed. They smooth instantaneous traffic bursts + to fit the link bandwidth. When demand exceeds link capacity long + enough to fill the queue, packets must be dropped. The traditional + action of dropping the most recent packet ("tail dropping") is no + longer recommended [RFC2309] [RFC2914], but it is still widely + practiced. + + TCP uses sequence numbering and acknowledgments (ACKs) on an + end-to-end basis to provide reliable, sequenced delivery. TCP ACKs + are cumulative, i.e., each implicitly ACKs every segment received so + far. If a packet with an unexpected sequence number is received, the + ACK field in the packets returned by the receiver will cease to + advance. Using an optional enhancement, TCP can send selective + acknowledgments (SACKs) [RFC2018] to indicate which segments have + arrived at the receiver. 
+ + Since the most common cause of packet loss is congestion, TCP treats + packet loss as an indication of potential Internet congestion along + the path between TCP end hosts. This happens automatically, and the + subnetwork need not know anything about IP or TCP. A subnetwork node + simply drops packets whenever it must, though some packet-dropping + strategies (e.g., RED) are more fair to competing flows than others. + + TCP recovers from packet losses in two different ways. The most + important mechanism is the retransmission timeout. If an ACK fails + to arrive after a certain period of time, TCP retransmits the oldest + unacked packet. Taking this as a hint that the network is congested, + TCP waits for the retransmission to be ACKed before it continues, and + it gradually increases the number of packets in flight as long as a + timeout does not occur again. + + A retransmission timeout can impose a significant performance + penalty, as the sender is idle during the timeout interval and + restarts with a congestion window of one TCP segment following the + + + + +Karn, et al. Best Current Practice [Page 20] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + timeout. To allow faster recovery from the occasional lost packet in + a bulk transfer, an alternate scheme, known as "fast recovery", was + introduced [RFC2581] [RFC2582] [RFC2914] [TCPF98]. + + Fast recovery relies on the fact that when a single packet is lost in + a bulk transfer, the receiver continues to return ACKs to subsequent + data packets that do not actually acknowledge any newly-received + data. These are known as "duplicate acknowledgments" or "dupacks". + The sending TCP can use dupacks as a hint that a packet has been lost + and retransmit it without waiting for a timeout. Dupacks effectively + constitute a negative acknowledgment (NAK) for the packet sequence + number in the acknowledgment field. 
TCP waits until a certain number + of dupacks (currently 3) are seen prior to assuming a loss has + occurred; this helps avoid an unnecessary retransmission during + out-of-sequence delivery. + + A technique called "Explicit Congestion Notification" (ECN) [RFC3168] + allows routers to directly signal congestion to hosts without + dropping packets. This is done by setting a bit in the IP header. + Since ECN support is likely to remain optional, the lack of an ECN + bit must *never* be interpreted as a lack of congestion. Thus, for + the foreseeable future, TCP must interpret a lost packet as a signal + of congestion. + + The TCP "congestion avoidance" [RFC2581] algorithm maintains a + congestion window (cwnd) controlling the amount of data TCP may have + in flight at any moment. Reducing cwnd reduces the overall bandwidth + obtained by the connection; similarly, raising cwnd increases + performance, up to the limit of the available capacity. + + TCP probes for available network capacity by initially setting cwnd + to one or two packets and then increasing cwnd by one packet for each + ACK returned from the receiver. This is TCP's "slow start" + mechanism. When a packet loss is detected (or congestion is signaled + by other mechanisms), cwnd is reset to one and the slow start process + is repeated until cwnd reaches one half of its previous setting + before the reset. Cwnd continues to increase past this point, but at + a much slower rate than before. If no further losses occur, cwnd + will ultimately reach the window size advertised by the receiver. + + This is an "Additive Increase, Multiplicative Decrease" (AIMD) + algorithm. The steep decrease of cwnd in response to congestion + provides for network stability; the AIMD algorithm also provides for + fairness between long running TCP connections sharing the same path. + + + + + + + +Karn, et al. Best Current Practice [Page 21] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + +8.5. 
TCP Performance Characteristics + + Caveat + + Here we present a current "state-of-the-art" understanding of TCP + performance. This analysis attempts to characterize the performance + of TCP connections over links of varying characteristics. + + Link designers may wish to use the techniques in this section to + predict what performance TCP/IP may achieve over a new link-layer + design. Such analysis is encouraged. Because this is a relatively + new analysis, and the theory is based on single-stream TCP + connections under "ideal" conditions, it should be recognized that + the results of such analysis may differ from actual performance in + the Internet. That being said, we have done our best to provide the + designers with helpful information to get an accurate picture of the + capabilities and limitations of TCP under various conditions. + +8.5.1. The Formulae + + The performance of TCP's AIMD Congestion Avoidance algorithm has been + extensively analyzed. The current best formula for the performance + of the specific algorithms used by Reno TCP (i.e., the TCP specified + in [RFC2581]) is given by Padhye, et al. [PFTK98]. This formula is: + + MSS + BW = -------------------------------------------------------- + RTT*sqrt(1.33*p) + RTO*p*[1+32*p^2]*min[1,3*sqrt(.75*p)] + + where + + BW is the maximum TCP throughput achievable by an + individual TCP flow + MSS is the TCP segment size being used by the connection + RTT is the end-to-end round trip time of the TCP connection + RTO is the packet timeout (based on RTT) + p is the packet loss rate for the path + (i.e., .01 if there is 1% packet loss) + + Note that the speed of the links making up the Internet path does not + explicitly appear in this formula. Attempting to send faster than + the slowest link in the path causes the queue to grow at the + transmitter driving the bottleneck. This increases the RTT, which in + turn reduces the achievable throughput.
+ + This is currently considered to be the best approximate formula for + Reno TCP performance. A further simplification of this formula is + generally made by assuming that RTO is approximately 5*RTT. + + + +Karn, et al. Best Current Practice [Page 22] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + TCP is constantly being improved. A simpler formula, which gives an + upper bound on the performance of any AIMD algorithm which is likely + to be implemented in TCP in the future, was derived by Ott, et al. + [MSMO97]. + + MSS 1 + BW = C --- ------- + RTT sqrt(p) + + where C is 0.93. + +8.5.2. Assumptions + + Both formulae assume that the TCP Receiver Window is not limiting the + performance of the connection. Because the receiver window is + entirely determined by end-hosts, we assume that hosts will maximize + the announced receiver window to maximize their network performance. + + Both of these formulae allow BW to become infinite if there is no + loss. However, an Internet path will drop packets at bottlenecked + queues if the load is too high. Thus, a completely lossless TCP/IP + network can never occur (unless the network is being underutilized). + + The RTT used is the arithmetic average, including queuing delays. + + The formulae are for a single TCP connection. If a path carries many + TCP connections, each will follow the formulae above independently. + + The formulae assume long-running TCP connections. For connections + that are extremely short (<10 packets) and don't lose any packets, + performance is driven by the TCP slow-start algorithm. For + connections of medium length, where on average only a few segments + are lost, single connection performance will actually be slightly + better than given by the formulae above. + + The difference between the simple and complex formulae above is that + the complex formula includes the effects of TCP retransmission + timeouts. 
For very low levels of packet loss (significantly less + than 1%), timeouts are unlikely to occur, and the formulae lead to + very similar results. At higher packet losses (1% and above), the + complex formula gives a more accurate estimate of performance (which + will always be significantly lower than the result from the simple + formula). + + Note that these formulae break down as p approaches 100%. + + + + + + +Karn, et al. Best Current Practice [Page 23] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + +8.5.3. Analysis of Link-Layer Effects on TCP Performance + + Consider the following example: + + A designer invents a new wireless link layer which, on average, loses + 1% of IP packets. The link layer supports packets of up to 1040 + bytes, and has a one-way delay of 20 msec. + + If this link were to be used on an Internet path with a round trip + time greater than 80ms, the upper bound may be computed by: + + For MSS, use 1000 bytes to exclude the 40 bytes of minimum IPv4 and + TCP headers. + + For RTT, use 120 msec (80 msec for the Internet part, plus 20 msec + each way for the new wireless link). + + For p, use .01. For C, assume 1. + + The simple formula gives: + + BW = (1000 * 8 bits) / (.120 sec * sqrt(.01)) = 666 kbit/sec + + The more complex formula gives: + + BW = 402.9 kbit/sec + + If this were a 2 Mb/s wireless LAN, the designers might be somewhat + disappointed. + + Some observations on performance: + + 1. We have assumed that the packet losses on the link layer are + interpreted as congestion by TCP. This is a "fact of life" that + must be accepted. + + 2. The equations for TCP performance are all expressed in terms of + packet loss, but many subnetwork designers think in terms of + bit-error ratio. *If* channel bit errors are independent, then + the probability of a packet being corrupted is: + + p = 1 - ([1 - BER]^[FRAME_SIZE*8]) + + Here we assume FRAME_SIZE is in bytes and "^" represents + exponentiation. 
FRAME_SIZE includes the user data and all headers + (TCP, IP and subnetwork). (Note: this analysis assumes the + + + + + +Karn, et al. Best Current Practice [Page 24] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + subnetwork does not perform ARQ or transparent fragmentation + [RFC3366].) If the inequality + + BER * [FRAME_SIZE*8] << 1 + + holds, the packet loss probability p can be approximated by: + + p = BER * [FRAME_SIZE*8] + + These equations can be used to apply BER to the performance + equations above. + + Note that FRAME_SIZE can vary from one packet to the next. Small + packets (such as TCP acks) generally have a smaller probability + of packet error than, say, a TCP packet carrying one MSS (maximum + segment size) of user data. A flow of small TCP acks can be + expected to be slightly more reliable than a stream of larger TCP + data segments. + + It bears repeating that the above analysis assumes that bit + errors are statistically independent. Because this is not true + for many real links, our computation of p is actually an upper + bound, not the exact probability of packet loss. + + There are many reasons why bit errors are not independent on real + links. Many radio links are affected by propagation fading or by + interference that lasts over many bit times. Also, links with + Forward Error Correction (FEC) generally have very non-uniform + bit error distributions that depend on the type of FEC, but in + general the uncorrected errors tend to occur in bursts even when + channel symbol errors are independent. In all such cases, our + computation of p from BER can only place an upper limit on the + packet loss rate. + + If the distribution of errors under the FEC scheme is known, one + could apply the same type of analysis as above, using the correct + distribution function for the BER. It is more likely in these + FEC cases, however, that empirical methods are needed to + determine the actual packet loss rate. + + 3.
Note that the packet size plays an important role. If the + subnetwork loss characteristics are such that large packets have + the same probability of loss as smaller packets, then larger + packets will yield improved performance. + + + + + + + +Karn, et al. Best Current Practice [Page 25] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + 4. We have chosen a specific RTT that might occur on a wide-area + Internet path within the USA. It is important to recognize that + a variety of RTT values are experienced in the Internet. + + For example, RTTs are typically less than 10 msec in a wired LAN + environment when communicating with a local host. International + connections may have RTTs of 200 msec or more. Modems and other + low-capacity links can add considerable delay due to their long + packet transmission (serialisation) times. + + Links over geostationary repeater satellites have one-way speed- + of-light delays of around 250ms, a minimum of 125ms propagation + delay up to the satellite and 125ms down. The RTT of an end-to- + end TCP connection that includes such a link can be expected to + be greater than 250ms. + + Queues on heavily-congested links may back up, increasing RTTs. + Finally, virtual private networks (VPNs) and other forms of + encryption and tunneling can add significant end-to-end delay to + network connections. + +9. Quality-of-Service (QoS) considerations + + It is generally recognized that specific service guarantees are + needed to support real-time multimedia, toll-quality telephony, and + other performance-critical applications. The provision of such + Quality of Service guarantees in the Internet is an active area of + research and standardization. The IETF has not converged on a single + service model, set of services, or single mechanism that will offer + useful guarantees to applications and be scalable to the Internet. + Indeed, the IETF does not have a single definition of Quality of + Service. 
[RFC2990] represents a current understanding of the + challenges in architecting QoS for the Internet. + + There are presently two architectural approaches to providing + mechanisms for QoS support in the Internet. + + IP Integrated Services (Intserv) [RFC1633] provides fine-grained + service guarantees to individual flows. Flows are identified by a + flow specification (flowspec), which creates a stateful association + between individual packets by matching fields in the packet header. + Capacity is reserved for the flow, and appropriate traffic + conditioning and scheduling is installed in routers along the path. + The ReSerVation Protocol (RSVP) [RFC2205] [RFC2210] is usually, but + need not necessarily be, used to install the flow QoS state. Intserv + defines two services, in addition to the Default (best effort) + service. + + + + +Karn, et al. Best Current Practice [Page 26] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + 1. Guaranteed Service (GS) [RFC2212] offers hard upper bounds on + delay to flows that conform to a traffic specification (TSpec). + It uses a fluid-flow model to relate the TSpec and reserved + bandwidth (RSpec) to variable delay. Non-conforming packets are + forwarded on a best-effort basis. + + 2. Controlled Load Service (CLS) [RFC2211] offers delay and packet + loss equivalent to that of an unloaded network to flows that + conform to a TSpec, but no hard bounds. Non-conforming packets + are forwarded on a best-effort basis. + + Intserv requires installation of state information in every + participating router. Performance guarantees cannot be made unless + this state is present in every router along the path. This, along + with RSVP processing and the need for usage-based accounting, is + believed to have scalability problems, particularly in the core of + the Internet [RFC2208]. + + IP Differentiated Services (Diffserv) [RFC2475] provides a "toolkit" + offering coarse-grained controls to aggregates of flows. 
Diffserv in + itself does *not* provide QoS guarantees, but can be used to + construct services with QoS guarantees across a Diffserv domain. + Diffserv attempts to address the scaling issues associated with + Intserv by requiring state awareness only at the edge of a Diffserv + domain. At the edge, packets are classified into flows, and the + flows are conditioned (marked, policed, or shaped) to a traffic + conditioning specification (TCS). A Diffserv Codepoint (DSCP), + identifying a per-hop behavior (PHB), is set in each packet header. + The DSCP is carried in the DS-field, subsuming six bits of the former + Type-of-Service (ToS) byte [RFC791] of the IP header [RFC2474]. The + PHB denotes the forwarding behavior to be applied to the packet in + each node in the Diffserv domain. Although there is a "recommended" + DSCP associated with each PHB, the mappings from DSCPs to PHBs are + defined by the DS-domain. In fact, there can be several DSCPs + associated with the same PHB. Diffserv presently defines three PHBs. + + 1. The class selector PHB [RFC2474] replaces the IP precedence field + of the former ToS byte. It offers relative forwarding + priorities. + + 2. The Expedited Forwarding (EF) PHB [RFC3246] [RFC3248] guarantees + that packets will have a well-defined minimum departure rate + which, if not exceeded, ensures that the associated queues are + short or empty. EF is intended to support services that offer + tightly-bounded loss, delay, and delay jitter. + + + + + + +Karn, et al. Best Current Practice [Page 27] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + 3. The Assured Forwarding (AF) PHB group [RFC2597] offers different + levels of forwarding assurance for each aggregated flow of + packets. Each AF group is independently allocated forwarding + resources. 
Packets are marked with one of three drop + precedences; those with the highest drop precedence are dropped + with higher probability than those marked with the lowest drop + precedence. DSCPs are recommended for four independent AF + groups, although a DS domain can have more or fewer AF groups. + + Ongoing work in the IETF is addressing ways to support Intserv with + Diffserv. There is some belief (e.g., as expressed in [RFC2990]) + that such an approach will allow individual flows to receive service + guarantees and scale to the global Internet. + + The QoS guarantees that can be offered by the IP layer are a product + of two factors: + + 1. the concatenation of the QoS guarantees offered by the subnets + along the path of a flow. This implies that a subnet may wish to + offer multiple services (with different QoS guarantees) to the IP + layer, which can then determine which flows use which subnet + service. To put it another way, forwarding behavior in the + subnet needs to be "clued" by the forwarding behavior (service or + PHB) at the IP layer, and + + 2. the operation of a set of cooperating mechanisms, such as + bandwidth reservation and admission control, policy management, + traffic classification, traffic conditioning (marking, policing + and/or shaping), selective discard, queuing, and scheduling. + Note that support for QoS in subnets may require similar + mechanisms, especially when these subnets are general topology + subnets (e.g., ATM, frame relay, or MPLS) or shared media + subnets. + + Many subnetwork designers face inherent tradeoffs between delay, + throughput, reliability, and cost. Other subnetworks have parameters + that manage bandwidth, internal connection state, and the like. + Therefore, the following subnetwork capabilities may be desirable, + although some might be trivial or moot if the subnet is a dedicated + point-to-point link. + + 1.
The subnetwork should have the ability to reserve bandwidth for a + connection or flow and schedule packets accordingly. + + 2. Bandwidth reservations should be based on a one- or two-token + bucket model, depending on whether the service is intended to + support constant-rate or bursty traffic. + + + + +Karn, et al. Best Current Practice [Page 28] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + 3. If a connection or flow does not use its reserved bandwidth at a + given time, the unused bandwidth should be available for other + flows. + + 4. Packets in excess of a connection or flow's agreed rate should be + forwarded as best-effort or discarded, depending on the service + offered by the subnet to the IP layer. + + 5. If a subnet contains error control mechanisms (retransmission + and/or FEC), it should be possible for the IP layer to influence + the inherent tradeoffs between uncorrected errors, packet losses, + and delay. These capabilities at the subnet/IP layer service + boundary correspond to selection of more or less error control + and/or to selection of particular error control mechanisms within + the subnetwork. + + 6. The subnet layer should know, and be able to inform the IP layer, + how much fixed delay and delay jitter it offers for a flow or + connection. If the Intserv model is used, the delay jitter + component may be best expressed in terms of the TSpec/RSpec model + described in [RFC2212]. + + 7. Support of the Diffserv class selectors [RFC2474] suggests that + the subnet might consider mechanisms that support priorities. + +10. Fairness vs Performance + + Subnetwork designers should be aware of the tradeoffs between + fairness and efficiency inherent in many transmission scheduling + algorithms. For example, many local area networks use contention + protocols to resolve access to a shared transmission channel. These + protocols represent overhead. 
While limiting the amount of data that + a subnet node may transmit per contention cycle helps assure timely + access to the channel for each subnet node, it also increases + contention overhead per unit of data sent. + + In some mobile radio networks, capacity is limited by interference, + which in turn depends on average transmitter power. Some receivers + may require considerably more transmitter power (generating more + interference and consuming more channel capacity) than others. + + In each case, the scheduling algorithm designer must balance + competing objectives: providing a fair share of capacity to each + subnet node while maximizing the total capacity of the network. One + approach for balancing performance and fairness is outlined in + [ES00]. + + + + + +Karn, et al. Best Current Practice [Page 29] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + +11. Delay Characteristics + + The TCP sender bases its retransmission timeout (RTO) on measurements + of the round trip delay experienced by previous packets. This allows + TCP to adapt automatically to the very wide range of delays found on + the Internet. The recommended algorithms are described in [RFC2988]. + Evaluations of TCP's retransmission timer can be found in [AP99] and + [LS00]. + + These algorithms model the delay along an Internet path as a + normally-distributed random variable with a slowly-varying mean and + standard deviation. TCP estimates these two parameters by + exponentially smoothing individual delay measurements, and it sets + the RTO to the estimated mean delay plus some fixed number of + standard deviations. (The algorithm actually uses mean deviation as + an approximation to standard deviation, because it is easier to + compute.) + + The goal is to compute an RTO that is small enough to detect and + recover from packet losses while minimizing unnecessary ("spurious") + retransmissions when packets are unexpectedly delayed but not lost. 
+ Although these goals conflict, the algorithm works well when the + delay variance along the Internet path is low, or the packet loss + rate is low. + + If the path delay variance is high, TCP sets an RTO that is much + larger than the mean of the measured delays. If the packet loss rate + is low, the large RTO is of little consequence, as timeouts occur + only rarely. Conversely, if the path delay variance is low, then TCP + recovers quickly from lost packets; again, the algorithm works well. + However, when delay variance and the packet loss rate are both high, + these algorithms perform poorly, especially when the mean delay is + also high. + + Because TCP uses returning acknowledgments as a "clock" to time the + transmission of additional data, excessively high delays (even if the + delay variance is low) also affect TCP's ability to fully utilize a + high-speed transmission pipe. It also slows the recovery of lost + packets, even when delay variance is small. + + Subnetwork designers should therefore minimize all three parameters + (delay, delay variance, and packet loss) as much as possible. + + In many subnetworks, these parameters are inherently in conflict. + For example, on a mobile radio channel, the subnetwork designer can + use retransmission (ARQ) and/or forward error correction (FEC) to + trade off delay, delay variance, and packet loss in an effort to + improve TCP performance. While ARQ increases delay variance, FEC + + + +Karn, et al. Best Current Practice [Page 30] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + does not. However, FEC (especially when combined with interleaving) + often increases mean delay, even on good channels where ARQ + retransmissions are not needed and ARQ would not increase either the + delay or the delay variance. + + The tradeoffs among these error control mechanisms and their + interactions with TCP can be quite complex, and are the subject of + much ongoing research. 
We therefore recommend that subnetwork + designers provide as much flexibility as possible in the + implementation of these mechanisms, and provide access to them as + discussed above in the section on Quality of Service. + +12. Bandwidth Asymmetries + + Some subnetworks may provide asymmetric bandwidth (or may cause TCP + packet flows to experience asymmetry in the capacity) and the + Internet protocol suite will generally still work fine. However, + there is a case when such a scenario reduces TCP performance. Since + TCP data segments are "clocked" out by returning acknowledgments, TCP + senders are limited by the rate at which ACKs can be returned + [BPK98]. Therefore, when the ratio of the available capacity of the + Internet path carrying the data to the bandwidth of the return path + of the acknowledgments is too large, the slow return of the ACKs + directly impacts performance. Since ACKs are generally smaller than + data segments, TCP can tolerate some asymmetry, but as a general + rule, designers of subnetworks should be aware that subnetworks with + significant asymmetry can result in reduced performance, unless + steps are taken to mitigate this [RFC3449]. + + Several strategies have been identified for reducing the impact of + asymmetry of the network path between two TCP end hosts, e.g., + [RFC3449]. These techniques attempt to reduce the number of ACKs + transmitted over the return path (low bandwidth channel) by changes + at the end host(s), and/or by modification of subnetwork packet + forwarding. While these solutions may mitigate the performance + issues caused by asymmetric subnetworks, they do have associated cost + and may have other implications. A fuller discussion of strategies + and their implications is provided in [RFC3449]. + +13. Buffering, flow and congestion control + + Many subnets include multiple links with varying traffic demands and + possibly different transmission speeds.
At each link there must be a + queuing system, including buffering, scheduling, and a capability to + discard excess subnet packets. These queues may also be part of a + subnet flow control or congestion control scheme. + + + + + +Karn, et al. Best Current Practice [Page 31] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + For the purpose of this discussion, we talk about packets without + regard to whether they refer to a complete IP packet or a subnetwork + frame. At each queue, a packet experiences a delay that depends on + competing traffic and the scheduling discipline, and is subjected to + a local discarding policy. + + Some subnets may have flow or congestion control mechanisms in + addition to packet dropping. Such mechanisms can operate on + components in the subnet layer, such as schedulers, shapers, or + discarders, and can affect the operation of IP forwarders at the + edges of the subnet. However, with the exception of Explicit + Congestion Notification [RFC3168] (discussed below), IP has no way to + pass explicit congestion or flow control signals to TCP. + + TCP traffic, especially aggregated TCP traffic, is bursty. As a + result, instantaneous queue depths can vary dramatically, even in + nominally stable networks. For optimal performance, packets should + be dropped in a controlled fashion, not just when buffer space is + unavailable. How much buffer space should be supplied is still a + matter of debate, but as a rule of thumb, each node should have + enough buffering to hold one link_bandwidth*link_delay product's + worth of data for each TCP connection sharing the link. + + This is often difficult to estimate, since it depends on parameters + beyond the subnetwork's control or knowledge. Internet nodes + generally do not implement admission control policies, and cannot + limit the number of TCP connections that use them. In general, it is + wise to err in favor of too much buffering rather than too little. 
+ It may also be useful for subnets to incorporate mechanisms that + measure propagation delays to assist in buffer sizing calculations. + + There is a rough consensus in the research community that active + queue management is important to improving fairness, link + utilization, and throughput [RFC2309]. Although there are questions + and concerns about the effectiveness of active queue management + (e.g., [MBDL99]), it is widely considered an improvement over tail- + drop discard policies. + + One form of active queue management is the Random Early Detection + (RED) algorithm [RED93], a family of related algorithms. In one + version of RED, an exponentially-weighted moving average of the queue + depth is maintained: + + When this average queue depth is between a maximum threshold + max_th and a minimum threshold min_th, the probability of packets + that are dropped is proportional to the amount by which the + average queue depth exceeds min_th. + + + + +Karn, et al. Best Current Practice [Page 32] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + When this average queue depth is equal to max_th, the drop + probability is equal to a configurable parameter max_p. + + When this average queue depth is greater than max_th, packets are + always dropped. + + Numerous variants on RED appear in the literature, and there are + other active queue management algorithms which claim various + advantages over RED [GM02]. + + With an active queue management algorithm, dropped packets become a + feedback signal to trigger more appropriate congestion behavior by + the TCPs in the end hosts. Randomization of dropping tends to break + up the observed tendency of TCP windows belonging to different TCP + connections to become synchronized by correlated drops, and it also + imposes a degree of fairness on those connections that implement TCP + congestion avoidance properly. 
Another important property of active + queue management algorithms is that they attempt to keep average + queue depths short while accommodating large short-term bursts. + + Since TCP neither knows nor cares whether congestive packet loss + occurs at the IP layer or in a subnet, it may be advisable for + subnets that perform queuing and discarding to consider implementing + some form of active queue management. This is especially true if + large aggregates of TCP connections are likely to share the same + queue. However, active queue management may be less effective in the + case of many queues carrying smaller aggregates of TCP connections, + e.g., in an ATM switch that implements per-VC queuing. + + Note that the performance of active queue management algorithms is + highly sensitive to settings of configurable parameters, and also to + factors such as RTT [MBB00] [FB00]. + + Some subnets, most notably ATM, perform segmentation and reassembly + at the subnetwork edges. Care should be taken here in designing + discard policies. If the subnet discards a fragment of an IP packet, + then the remaining fragments become an unproductive load on the + subnet that can markedly degrade end-to-end performance [RF95]. + Subnetworks should therefore attempt to discard these extra fragments + whenever one of them must be discarded. If the IP packet has already + been partially forwarded when discarding becomes necessary, then + every remaining fragment except the one marking the end of the IP + packet should also be discarded. For ATM subnets, this specifically + means using Early Packet Discard and Partial Packet Discard [ATMFTM]. + + Some subnets include flow control mechanisms that effectively require + that the rate of traffic flows be shaped upon entry to the subnet. + One example of such a subnet mechanism is in the ATM Available Bit + + + +Karn, et al. 
Best Current Practice [Page 33] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + rate (ABR) service category [ATMFTM]. Such flow control mechanisms + have the effect of making the subnet nearly lossless by pushing + congestion into the IP routers at the edges of the subnet. In such a + case, adequate buffering and discard policies are needed in these + routers to deal with a subnet that appears to have varying bandwidth. + Whether there is a benefit in this kind of flow control is + controversial; there are numerous simulation and analytical studies + that go both ways. It appears that some of the issues leading to + such different results include sensitivity to ABR parameters, use of + binary rather than explicit rate feedback, use (or not) of per-VC + queuing, and the specific ATM switch algorithms selected for the + study. Anecdotally, some large networks that used IP over ABR to + carry TCP traffic have claimed it to be successful, but have + published no results. + + Another possible approach to flow control in the subnet would be to + work with TCP Explicit Congestion Notification (ECN) semantics + [RFC3168] through utilizing explicit congestion indicators in subnet + frames. Routers at the edges of the subnet, rather than shaping, + would set the explicit congestion bit in those IP packets that are + received in subnet frames that have an ECN indication. Nodes in the + subnet would need to implement an active queue management protocol + that marks subnet frames instead of dropping them. + + ECN is currently a proposed standard, but it is not yet widely + deployed. + +14. Compression + + Application data compression is a function that can usually be + omitted in the subnetwork. The endpoints typically have more CPU and + memory resources to run a compression algorithm and a better + understanding of what is being compressed. 
End-to-end compression + benefits every network element in the path, while subnetwork-layer + compression, by definition, benefits only a single subnetwork. + + Data presented to the subnetwork layer may already be in a compressed + format (e.g., a JPEG file), compressed at the application layer + (e.g., the optional "gzip", "compress", and "deflate" compression in + HTTP/1.1 [RFC2616]), or compressed at the IP layer (the IP Payload + Compression Protocol [RFC3173] supports DEFLATE [RFC2394] and LZS + [RFC2395]). Compression at the subnetwork edges is of no benefit for + any of these cases. + + The subnetwork may also process data that has been encrypted by the + application (OpenPGP [RFC2440] or S/MIME [RFC2633]), just above TCP + (SSL, TLS [RFC2246]), or just above IP (IPsec ESP [RFC2406]). + + + + +Karn, et al. Best Current Practice [Page 34] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + Ciphers generate high-entropy bit streams lacking any patterns that + can be exploited by a compression algorithm. + + However, much data is still transmitted uncompressed over the + Internet, so subnetwork compression may be beneficial. Any + subnetwork compression algorithm must not expand uncompressible data, + e.g., data that has already been compressed or encrypted. + + We make a strong recommendation that subnetworks operating at low + speed or with small MTUs compress IP and transport-level headers (TCP + and UDP) using several header compression schemes developed within + the IETF [RFC3150]. An uncompressed 40-byte TCP/IP header takes + about 33 milliseconds to send at 9600 bps. "VJ" TCP/IP header + compression [RFC1144] compresses most headers to 3-5 bytes, reducing + transmission time to several milliseconds on dialup modem links. + This is especially beneficial for small, latency-sensitive packets in + interactive sessions. 
+ + Similarly, RTP compression schemes, such as CRTP [RFC2508] and ROHC + [RFC3095], compress most IP/UDP/RTP headers to 1-4 bytes. The + resulting savings are especially significant when audio packets are + kept small to minimize store-and-forward latency. + + Designers should consider the effect of the subnetwork error rate on + the performance of header compression. TCP ordinarily recovers from + lost packets by retransmitting only those packets that were actually + lost; packets arriving correctly after a packet loss are kept on a + resequencing queue and do not need to be retransmitted. In VJ TCP/IP + [RFC1144] header compression, however, the receiver cannot explicitly + notify a sender of data corruption and subsequent loss of + synchronization between compressor and decompressor. It relies + instead on TCP retransmission to re-synchronize the decompressor. + After a packet is lost, the decompressor must discard every + subsequent packet, even if the subnetwork makes no further errors, + until the sending TCP retransmits to re-synchronize the decompressor. + This effect can substantially magnify the effect of subnetwork packet + losses if the sending TCP window is large, as it will often be on a + path with a large bandwidth*delay product [LRKOJ99]. + + Alternate header compression schemes, such as those described in + [RFC2507], include an explicit request for retransmission of an + uncompressed packet to allow decompressor resynchronization without + waiting for a TCP retransmission. However, these schemes are not yet + in widespread use. + + Both TCP header compression schemes do not compress widely-used TCP + options such as selective acknowledgements (SACK). Both fail to + compress TCP traffic that makes use of explicit congestion + + + +Karn, et al. Best Current Practice [Page 35] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + notification (ECN). 
Work is under way in the IETF ROHC WG to address + these shortcomings in a ROHC header compression scheme for TCP + [RFC3095] [RFC3096]. + + The subnetwork error rate also is important for RTP header + compression. CRTP uses delta encoding, so a packet loss on the link + causes uncertainty about the subsequent packets, which often must be + discarded until the decompressor has notified the compressor and the + compressor has sent re-synchronizing information. This typically + takes slightly more than the end-to-end path round-trip time. For + links that combine significant error rates with latencies that + require multiple packets to be in flight at a time, this leads to + significant error propagation, i.e., subsequent losses caused by an + initial loss. + + For links that are both high-latency (multiple packets in flight from + a typical RTP stream) and error-prone, RTP ROHC provides a more + robust way of RTP header compression, at a cost of higher complexity + at the compressor and decompressor. For example, within a talk + spurt, only extended losses of (depending on the mode chosen) 12-64 + packets typically cause error propagation. + +15. Packet Reordering + + The Internet architecture does not guarantee that packets will arrive + in the same order in which they were originally transmitted; + transport protocols like TCP must take this into account. + + However, reordering does come at a cost with TCP as it is currently + defined. Because TCP returns a cumulative acknowledgment (ACK) + indicating the last in-order segment that has arrived, out-of-order + segments cause a TCP receiver to transmit a duplicate acknowledgment. + When the TCP sender notices three duplicate acknowledgments, it + assumes that a segment was dropped by the network and uses the fast + retransmit algorithm [Jac90] [RFC2581] to resend the segment. In + addition, the congestion window is reduced by half, effectively + halving TCP's sending rate. 
If a subnetwork reorders segments + significantly such that three duplicate ACKs are generated, the TCP + sender needlessly reduces the congestion window and performance + suffers. + + Packet reordering frequently occurs in parts of the Internet, and it + seems to be difficult or impossible to eliminate [BPS99]. For this + reason, research on improving TCP's behavior in the face of packet + reordering [LK00] [BA02] has begun. + + + + + + +Karn, et al. Best Current Practice [Page 36] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + [BPS99] cites reasons why it may even be undesirable to eliminate + reordering. There are situations where average packet latency can be + reduced, link efficiency can be increased, and/or reliability can be + improved if reordering is permitted. Examples include certain high + speed switches within the Internet backbone and the parallel links + used over many Internet paths for load splitting and redundancy. + + This suggests that subnetwork implementers should try to avoid packet + reordering whenever possible, but not if doing so compromises + efficiency, impairs reliability, or increases average packet delay. + + Note that every header compression scheme currently standardized for + the Internet requires in-order packet delivery on the link between + compressor and decompressor. PPP is frequently used to carry + compressed TCP/IP packets; since it was originally designed for + point-to-point and dialup links, it is assumed to provide in-order + delivery. For this reason, subnetwork implementers who provide PPP + interfaces to VPNs and other more complex subnetworks, must also + maintain in-order delivery of PPP frames. + +16. Mobility + + Internet users are increasingly mobile. Not only are many Internet + nodes laptop computers, but pocket organizers and mobile embedded + systems are also becoming nodes on the Internet. 
These nodes may + connect to many different access points on the Internet over time, + and they expect this to be largely transparent to their activities. + Except when they are not connected to the Internet at all, and for + performance differences when they are connected, they expect that + everything will "just work" regardless of their current Internet + attachment point or local subnetwork technology. + + Changing a host's Internet attachment point involves one or more of + the following steps. + + First, if use of the local subnetwork is restricted, the user's + credentials must be verified and access granted. There are many ways + to do this. A trivial example would be an "Internet cafe" that + grants physical access to the subnetwork for a fee. Subnetworks may + implement technical access controls of their own; one example is IEEE + 802.11 Wired Equivalent Privacy [IEEE80211]. It is common + practice for both cellular telephone and Internet service providers + (ISPs) to agree to serve one another's users; RADIUS [RFC2865] is the + standard method for ISPs to exchange authorization information. + + Second, the host may have to be reconfigured with IP parameters + appropriate for the local subnetwork. This usually includes setting + an IP address, default router, and domain name system (DNS) servers. + + + +Karn, et al. Best Current Practice [Page 37] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + On multiple-access networks, the Dynamic Host Configuration Protocol + (DHCP) [RFC2131] is almost universally used for this purpose. On PPP + links, these functions are performed by the IP Control Protocol + (IPCP) [RFC1332]. + + Third, traffic destined for the mobile host must be routed to its + current location. This roaming function is the most common meaning + of the term "Internet mobility".
+ + Internet mobility can be provided at any of several layers in the + Internet protocol stack, and there is ongoing debate as to which is + the most appropriate and efficient. Mobility is already a feature of + certain application layer protocols; the Post Office Protocol (POP) + [RFC1939] and the Internet Message Access Protocol (IMAP) [RFC3501] + were created specifically to provide mobility in the receipt of + electronic mail. + + Mobility can also be provided at the IP layer [RFC3344]. This + mechanism provides greater transparency, viz., IP addresses that + remain fixed as the nodes move, but at the cost of potentially + significant network overhead and increased delay because of the sub- + optimal network routing and tunneling involved. + + Some subnetworks may provide internal mobility, transparent to IP, as + a feature of their own internal routing mechanisms. To the extent + that these simplify routing at the IP layer, reduce the need for + mechanisms like Mobile IP, or exploit mechanisms unique to the + subnetwork, this is generally desirable. This is especially true + when the subnetwork covers a relatively small geographic area and the + users move rapidly between the attachment points within that area. + Examples of internal mobility schemes include Ethernet switching and + intra-system handoff in cellular telephony. + + However, if the subnetwork is physically large and connects to other + parts of the Internet at multiple geographic points, care should be + taken to optimize the wide-area routing of packets between nodes on + the external Internet and nodes on the subnet. This is generally + done with "nearest exit" routing strategies. Because a given + subnetwork may be unaware of the actual physical location of a + destination on another subnetwork, it simply routes packets bound for + the other subnetwork to the nearest router between the two. This + implies some awareness of IP addressing and routing within the + subnetwork. 
The subnetwork may wish to use IP routing internally for + wide area routing and restrict subnetwork-specific routing to + constrained geographic areas where the effects of suboptimal routing + are minimized. + + + + + +Karn, et al. Best Current Practice [Page 38] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + +17. Routing + + Subnetworks connecting more than two systems must provide their own + internal Layer-2 forwarding mechanisms, either implicitly (e.g., + broadcast) or explicitly (e.g., switched). Since routing is the + major function of the Internet layer, the question naturally arises + as to the interaction between routing at the Internet layer and + routing in the subnet, and proper division of function between the + two. + + Layer-2 subnetworks can be point-to-point, connecting two systems, or + multipoint. Multipoint subnetworks can be broadcast (e.g., shared + media or emulated) or non-broadcast. Generally, IP considers + multipoint subnetworks as broadcast, with shared-medium Ethernet as + the canonical (and historical) example, and point-to-point + subnetworks as a degenerate case. Non-broadcast subnetworks may + require additional mechanisms, e.g., above IP at the routing layer + [RFC2328]. + + IP is ignorant of the topology of the subnetwork layer. In + particular, reconfiguration of subnetwork paths is not tracked by the + IP layer. IP is only affected by whether it can send/receive packets + sent to the remotely connected systems via the subnetwork interface + (i.e., the reachability from one router to another). IP further + considers that subnetworks are largely static -- that both their + membership and existence are stable at routing timescales (tens of + seconds); changes to these are considered re-provisioning, rather + than routing. + + Routing functionality in a subnetwork is related to addressing in + that subnetwork. 
Resolution of addresses on subnetwork links is + required for forwarding IP packets across links (e.g., ARP for IPv4, + or ND for IPv6). There is unlikely to be direct interaction between + subnetwork routing and IP routing. Where broadcast is provided or + explicitly emulated, address resolution can be used directly; where + not provided, the link layer routing may interface to a protocol for + resolution, e.g., to the Next-Hop Resolution Protocol [RFC2332] to + provide context-dependent address resolution capabilities. + + Subnetwork routing can either complement or compete with IP routing. + It complements IP when a subnetwork encapsulates its internal + routing, and where the effects of that routing are not visible at the + IP layer. However, if different paths in the subnetwork have + characteristics that affect IP routing, it can affect or even inhibit + the convergence of IP routing. + + + + + + +Karn, et al. Best Current Practice [Page 39] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + Routing protocols generally consider Layer-2 subnetworks, i.e., with + subnet masks and no intermediate IP hops, to have uniform routing + metrics to all members. Routing can break when a link's + characteristics do not match the routing metric, in this case, e.g., + when some member pairs have different path characteristics. Consider + a virtual Ethernet subnetwork that includes both nearby (sub- + millisecond latency) and remote (100's of milliseconds away) systems. + Presenting that group as a single subnetwork means that some routing + protocols will assume that all pairs have the same delay, and that + that delay is small. Because this is not the case, the routing + tables constructed may be suboptimal or may even fail to converge. + + When a subnetwork is used for transit between a set of routers, it + conventionally provides the equivalent of a full mesh of point-to- + point links.
Simplicity of the internal subnet structure can be used + (e.g., via NHRP [RFC2332]) to reduce the size of address resolution + tables, but routing exchanges will continue to reflect the full mesh + they emulate. In general, subnetworks should not be used as a + transit among a set of routers where routing protocols would break if + a full mesh of equivalent point-to-point links were used. + + Some subnetworks have special features that allow the use of more + effective or responsive routing mechanisms that cannot be implemented + in IP because of its need for generality. One example is the self- + learning bridge algorithm widely used in Ethernet networks. Learning + bridges perform Layer-2 subnetwork forwarding, avoiding the need for + dynamic routing at each subnetwork hop. Another is the "handoff" + mechanism in cellular telephone networks, particularly the "soft + handoff" scheme in IS-95 CDMA. + + Subnetworks that cover large geographic areas or include links of + widely-varying capabilities should be avoided. IP routing generally + considers all multipoint subnets equivalent to a local, shared-medium + link with uniform metrics between any pair of systems, and ignores + internal subnetwork topology. Where a subnetwork diverges from that + assumption, it is the obligation of subnetwork designers to provide + compensating mechanisms. Not doing so can affect the scalability and + convergence of IP routing, as noted above. + + The subnetwork designer who decides to implement internal routing + should consider whether a custom routing algorithm is warranted, or + if an existing Internet routing algorithm or protocol may suffice. + The designer should consider whether this decision is to reduce the + address resolution table size (possible, but with additional protocol + support required), or is trying to reduce routing table complexity. 
+ The latter may be better achieved by partitioning the subnetwork, + either physically or logically, and using network-layer protocols to + support partitioning (e.g., AS's in BGP). Protocols and routing + + + +Karn, et al. Best Current Practice [Page 40] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + algorithms can be notoriously subtle, complex, and difficult to + implement correctly. Much work can be avoided if existing protocols + or implementations can be readily reused. + +18. Security Considerations + + Security has become a high priority in the design and operation of + the Internet. The Internet is vast, and countless organizations and + individuals own and operate its various components. A consensus has + emerged for what might be called a "security placement principle": a + security mechanism is most effective when it is placed as close as + possible to, and under the direct control of the owner of the asset + that it protects. + + A corollary of this principle is that end-to-end security (e.g., + confidentiality, authentication, integrity, and access control) + cannot be ensured with subnetwork security mechanisms. Not only are + end-to-end security mechanisms much more closely associated with the + end-user assets they protect, they are also much more comprehensive. + For example, end-to-end security mechanisms cover gaps that can + appear when otherwise good subnetwork mechanisms are concatenated. + This is an important application of the end-to-end principle [SRC81]. + + Several security mechanisms that can be used end-to-end have already + been deployed in the Internet and are enjoying increasing use. 
The + most important are the Secure Sockets Layer (SSL) [SSL2] [SSL3] and + TLS [RFC2246] primarily used to protect web commerce, Pretty Good + Privacy (PGP) [RFC1991] and S/MIME [RFCs-2630-2634], primarily used + to protect and authenticate email and software distributions, the + Secure Shell (SSH), used for secure remote access and file transfer, + and IPsec [RFC2401], a general purpose encryption and authentication + mechanism that sits just above IP and can be used by any IP + application. (IPsec can actually be used either on an end-to-end + basis or between security gateways that do not include either or both + end systems.) + + Nonetheless, end-to-end security mechanisms are not used as widely as + might be desired. However, the group could not reach consensus on + whether subnetwork designers should be actively encouraged to + implement mechanisms to protect user data. + + The clear consensus of the working group held that subnetwork + security mechanisms, especially when weak or incorrectly implemented + [BGW01], may actually be counterproductive. The argument is that + subnetwork security mechanisms can lull end users into a false sense + of security, diminish the incentive to deploy effective end-to-end + + + + + +Karn, et al. Best Current Practice [Page 41] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + mechanisms, and encourage "risky" uses of the Internet that would not + be made if users understood the inherent limits of subnetwork + security mechanisms. + + The other point of view encourages subnetwork security on the + principle that it is better than the default situation, which all too + often is no security at all. Users of especially vulnerable subnets + (such as consumers who have wireless home networks and/or shared + media Internet access) often have control over at most one endpoint + -- usually a client -- and therefore cannot enforce the use of end- + to-end mechanisms. 
However, subnet security can be entirely adequate + for protecting low-valued assets against the most likely threats. In + any event, subnet mechanisms do not preclude the use of end-to-end + mechanisms, which are typically used to protect highly-valued assets. + This viewpoint recognizes that many security policies implicitly + assume that the entire end-to-end path is composed of a series of + concatenated links that are nominally physically secured. That is, + these policies assume that all endpoints of all links are trusted, + and that access to the physical medium by attackers is difficult. To + meet the assumptions of such policies, explicit mechanisms are needed + for links (especially shared medium links) that lack physical + protection. This, for example, is the rationale that underlies Wired + Equivalent Privacy (WEP) in the IEEE 802.11 [IEEE80211] wireless LAN + standard, and the Baseline Privacy Interface in the DOCSIS [DOCSIS1] + [DOCSIS2] data over cable television networks standards. + + We therefore recommend that subnetwork designers who choose to + implement security mechanisms to protect user data be as candid as + possible with the details of such security mechanisms and the + inherent limits of even the most secure mechanisms when implemented + in a subnetwork rather than on an end-to-end basis. + + In keeping with the "placement principle", a clear consensus exists + for another subnetwork security role: the protection of the + subnetwork itself. Possible threats to subnetwork assets include + theft of service and denial of service; shared media subnets tend to + be especially vulnerable to such attacks. In some cases, mechanisms + that protect subnet assets can also improve (but cannot ensure) end- + to-end security. + + One security service can be provided by the subnetwork that will aid + in the solution of an overall Internet problem: subnetwork security + should provide a mechanism to authenticate the source of a subnetwork + frame. 
This function is missing in some current protocols, e.g., the + use of ARP [RFC826] to associate an IPv4 address with a MAC address. + The IPv6 Neighbor Discovery (ND) [RFC2461] performs a similar + function. + + + + +Karn, et al. Best Current Practice [Page 42] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + There are well-known security flaws with this address resolution + mechanism [Wilbur89]. However, the inclusion of subnetwork frame + source authentication will permit a secure subnetwork address. + + Another potential role for subnetwork security is to protect users + against traffic analysis, i.e., identifying the communicating parties + and determining their communication patterns and volumes even when + their actual contents are protected by strong end-to-end security + mechanisms. Lower-layer security can be more effective against + traffic analysis due to its inherent ability to aggregate the + communications of multiple parties sharing the same physical + facilities while obscuring higher-layer protocol information that + indicates specific end points, such as IP addresses and TCP/UDP port + numbers. + + However, traffic analysis is a notoriously subtle and difficult + threat to understand and defeat, far more so than threats to + confidentiality and integrity. We therefore urge extreme care in the + design of subnetwork security mechanisms specifically intended to + thwart traffic analysis. + + Subnetwork designers must keep in mind that design and implementation + for security is difficult [Schneier00]. [Schneier95] describes + protocols and algorithms which are considered well-understood and + believed to be sound. + + Poor design process, subtle design errors and flawed implementation + can result in gaping vulnerabilities. In recent years, a number of + subnet standards have had problems exposed. The following are + examples of mistakes that have been made: + + 1. Use of weak and untested algorithms [Crypto9912] [BGW01]. 
For a + variety of reasons, algorithms were chosen which had subtle + flaws, making them vulnerable to a variety of attacks. + + 2. Use of "security by obscurity" [Schneier00] [Crypto9912]. One + common mistake is to assume that keeping cryptographic algorithms + secret makes them more secure. This is intuitive, but wrong. + Full public disclosure early in the design process attracts peer + review by knowledgeable cryptographers. Exposure of flaws by + this review far outweighs any imagined benefit from forcing + attackers to reverse engineer security algorithms. + + 3. Inclusion of trapdoors [Schneier00] [Crypto9912]. Trapdoors are + flaws surreptitiously left in an algorithm to allow it to be + broken. This might be done to recover lost keys or to permit + surreptitious access by governmental agencies. Trapdoors can be + discovered and exploited by malicious attackers. + + + +Karn, et al. Best Current Practice [Page 43] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + 4. Sending passwords or other identifying information as clear text. + For many years, analog cellular telephones could be cloned and + used to steal service. The cloners merely eavesdropped on the + registration protocols that exchanged everything in clear text. + + 5. Keys which are common to all systems on a subnet [BGW01]. + + 6. Incorrect use of a sound mechanism. For example [BGW01], one + subnet standard includes an initialization vector which is poorly + designed and poorly specified. A determined attacker can easily + recover multiple ciphertexts encrypted with the same key stream + and perform statistical attacks to decipher them. + + 7. Identifying information sent in clear text that can be resolved + to an individual, identifiable device. This creates a + vulnerability to attacks targeted to that device (or its owner). + + 8. Inability to renew and revoke shared secret information. + + 9. Insufficient key length. + + 10. 
Failure to address "man-in-the-middle" attacks, e.g., with mutual + authentication. + + 11. Failure to provide a form of replay detection, e.g., to prevent a + receiver from accepting packets from an attacker that simply + resends previously captured network traffic. + + 12. Failure to provide integrity mechanisms when providing + confidentiality schemes [Bel98]. + + This list is by no means comprehensive. Design problems are + difficult to avoid, but expert review is generally invaluable in + avoiding problems. + + In addition, well-designed security protocols can be compromised by + implementation defects. Examples of such defects include use of + predictable pseudo-random numbers [RFC1750], vulnerability to buffer + overflow attacks due to unsafe use of certain I/O system calls + [WFBA2000], and inadvertent exposure of secret data. + +19. Contributors + + This document represents a consensus of the members of the IETF + Performance Implications of Link Characteristics (PILC) working + group. + + + + + +Karn, et al. Best Current Practice [Page 44] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + This document would not have been possible without the contributions + of a great number of people in the Performance Implications of Link + Characteristics Working Group. In particular, the following people + provided major contributions of text, editing, and advice on this + document: Mark Allman provided the final editing to complete this + document. Carsten Bormann provided text on robust header + compression. Gorry Fairhurst provided text on broadcast and + multicast issues, routing, and many valuable comments on the entire + document. Aaron Falk provided text on bandwidth on demand. Dan + Grossman provided text on many facets of the document. Reiner Ludwig + provided thorough document review and text on TCP vs. Link-Layer + Retransmission. Jamshid Mahdavi provided text on TCP performance + calculations. 
Saverio Mascolo provided feedback on the document. + Gabriel Montenegro provided feedback on the document. Marie-Jose + Montpetit provided text on bandwidth on demand. Joe Touch provided + text on multicast, broadcast, and routing, and Lloyd Wood provided + many valuable comments on versions of the document. + +20. Informative References + + References of the form RFCnnnn are Internet Request for Comments + (RFC) documents available online at www.rfc-editor.org. + + [802.1D] Information Technology - Telecommunications and + information exchange between systems - Local and + metropolitan area networks - Common specifications - Media + access control (MAC) bridges, IEEE 802.1D, 1998. ISO + 15802-3. + + [802.1p] IEEE, 802.1p, Standard for Local and Metropolitan Area + Networks - Supplement to Media Access Control (MAC) + Bridges: Traffic Class Expediting and Multicast. + + [AP99] Allman, M. and V. Paxson, On Estimating End-to-End + Network Path Properties, In Proceedings of ACM SIGCOMM + 99. + + [AR02] Acar, G. and C. Rosenberg, Weighted Fair Bandwidth-on- + Demand (WFBoD) for Geo-Stationary Satellite Networks + with On-Board Processing, Computer Networks, 39(1), + 2002. + + [ATMFTM] The ATM Forum, "Traffic Management Specification, + Version 4.0", April 1996, document af-tm-0056.000. + http://www.atmforum.com/ + + + + + + +Karn, et al. Best Current Practice [Page 45] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + [BA02] Blanton, E. and M. Allman, On Making TCP More Robust to + Packet Reordering. ACM Computer Communication Review, + 32(1), January 2002. + + [Bel98] Bellovin, S., "Cryptography and the Internet", in + Proceedings of CRYPTO '98, August 1998. + http://www.research.att.com/~smb/papers/inet-crypto.pdf + + [BGW01] Borisov, N., Goldberg, I. and D. Wagner, "Intercepting + Mobile Communications: The Insecurity of 802.11," In + Proceedings of ACM MobiCom, July 2001. + + [BPK98] Balakrishnan, H., Padmanabhan, V. and R. Katz.
"The + Effects of Asymmetry on TCP Performance." ACM Mobile + Networks and Applications (MONET), 1998. + + [BPS99] Bennet, J.C.R., Partridge, C. and N. Shectman, "Packet + Reordering is Not Pathological Network Behavior", + IEEE/ACM Transactions on Networking, Vol. 7, No. 6, + December 1999. + + [CGMP] Farinacci D., Tweedly A. and T. Speakman, "Cisco Group + Management Protocol (CGMP)", 1996/1997. + ftp://ftpeng.cisco.com/ipmulticast/specs/cgmp.txt + + [Crypto9912] Schneier, B., "European Cellular Encryption Algorithms", + Crypto-Gram, December 15, 1999. + http://www.counterpane.com + + [DIX82] Digital Equipment Corp, Intel Corp, Xerox Corp, + Ethernet Local Area Network Specification Version 2.0, + November 1982. + + [DOCSIS1] Data-Over-Cable Service Interface Specifications, Radio + Frequency Interface Specification 1.0, SP-RFI-I05- + 991105, November 1999, Cable Television Laboratories, + Inc. + + [DOCSIS2] Data-Over-Cable Service Interface Specifications, Radio + Frequency Interface Specification 1.1, SP-RFIv1.1-I05- + 000714, July 2000, Cable Television Laboratories, Inc. + + [DOCSIS3] Lai, W.S., "DOCSIS-Based Cable Networks: Impact of + Large Data Packets on Upstream Capacity", 14th ITC + Specialists Seminar on Access Networks and Systems, + Barcelona, Spain, April 25-27, 2001. + + + + + +Karn, et al. Best Current Practice [Page 46] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + [EN301192] ETSI, European Broadcasting Union, Digital Video + Broadcasting (DVB); DVB Specification for Data + Broadcasting, European Standard (Telecommunications + Series) EN 301 192 v1.2.1(1999-06). + + [ES00] Eckhardt, D. and P. Steenkiste, "Effort-limited Fair + (ELF) Scheduling for Wireless Networks", Proceedings of + IEEE Infocom 2000. + + [FB00] Firoiu V. and M. Borden, "A Study of Active Queue + Management for Congestion Control", to appear in Infocom + 2000. + + [GM02] Grieco, L. and S.
Mascolo, "TCP Westwood and Easy RED + to Improve Fairness in High-Speed Networks", + Proceedings of the 7th International Workshop on + Protocols for High-Speed Networks, April 2002. + + [IEEE8023] IEEE 802.3 CSMA/CD Access Method. + http://standards.ieee.org/ + + [IEEE80211] IEEE 802.11 Wireless LAN standard. + http://standards.ieee.org/ + + [ISO3309] ISO/IEC 3309:1991(E), "Information Technology - + Telecommunications and information exchange between + systems - High-level data link control (HDLC) + procedures - Frame structure", International + Organization For Standardization, Fourth edition 1991- + 06-01. + + [ISO13818] ISO/IEC, ISO/IEC 13818-1:2000(E) Information + Technology - Generic coding of moving pictures and + associated audio information: Systems, Second edition, + 2000-12-01 International Organization for + Standardization and International Electrotechnical + Commission. + + [ITU-I363] ITU-T I.363.5 B-ISDN ATM Adaptation Layer Specification + Type AAL5, International Standards Organisation (ISO), + 1996. + + [Jac90] Jacobson, V., Modified TCP Congestion Avoidance + Algorithm. Email to the end2end-interest mailing list, + April 1990. + ftp://ftp.ee.lbl.gov/email/vanj.90apr30.txt + + + + + +Karn, et al. Best Current Practice [Page 47] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + [KY02] Khafizov, F. and M. Yavuz, Running TCP Over IS-2000, + Proceedings of IEEE ICC, 2002. + + [LK00] Ludwig, R. and R. H. Katz, "The Eifel Algorithm: Making + TCP Robust Against Spurious Retransmissions", ACM + Computer Communication Review, Vol. 30, No. 1, January + 2000. + + [LKJK02] Ludwig, R., Konrad, A., Joseph, A. D. and R. H. Katz, + "Optimizing the End-to-End Performance of Reliable + Flows over Wireless Links", Kluwer/ACM Wireless + Networks Journal, Vol. 8, Nos. 2/3, pp. 289-299, + March-May 2002. + + [LRKOJ99] Ludwig, R., Rathonyi, B., Konrad, A., Oden, K. and A. + Joseph, Multi-Layer Tracing of TCP over a Reliable + Wireless Link, pp. 
144-154, In Proceedings of ACM + SIGMETRICS 99. + + [LS00] Ludwig, R. and K. Sklower, The Eifel Retransmission + Timer, ACM Computer Communication Review, Vol. 30, No. + 3, July 2000. + + [MAGMA-PROXY] Fenner, B., He, H., Haberman, B. and H. Sandick, + "IGMP/MLD-based Multicast Forwarding ("IGMP/MLD + Proxying")", Work in Progress. + + [MAGMA-SNOOP] Christensen, M., Kimball, K. and F. Solensky, + "Considerations for IGMP and MLD Snooping Switches", + Work in Progress. + + [MBB00] May, M., Bonald, T. and J-C. Bolot, "Analytic + Evaluation of RED Performance", INFOCOM 2000. + + [MBDL99] May, M., Bolot, J., Diot, C. and B. Lyles, "Reasons not + to deploy RED", Proc. of 7th. International Workshop on + Quality of Service (IWQoS'99), June 1999. + + [MSMO97] Mathis, M., Semke, J., Mahdavi, J. and T. Ott, "The + Macroscopic Behavior of the TCP Congestion Avoidance + Algorithm", Computer Communication Review, Vol. 27, + number 3, July 1997. + + [MYR95] Boden, N., Cohen, D., Felderman, R., Kulawik, A., + Seitz, C., et al. MYRINET: A Gigabit per Second Local + Area Network, IEEE-Micro, Vol. 15, No.1, February 1995, + pp. 29-36. + + + + +Karn, et al. Best Current Practice [Page 48] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + [PFTK98] Padhye, J., Firoiu, V., Towsley, D. and J. Kurose, + "Modeling TCP Throughput: a Simple Model and its + Empirical Validation", UMASS CMPSCI Tech Report TR98- + 008, Feb. 1998. + + [RED93] Floyd, S. and V. Jacobson, "Random Early Detection + gateways for Congestion Avoidance", IEEE/ACM + Transactions in Networking, Vol. 1 No. 4, August 1993. + http://www.aciri.org/floyd/papers/red/red.html + + [RF95] Romanow, A. and S. Floyd, "Dynamics of TCP Traffic over + ATM Networks". IEEE Journal of Selected Areas in + Communication, Vol.13 No. 4, May 1995, p. 633-641. + + [RFC791] Postel, J., "Internet Protocol", STD 5, RFC 791, + September 1981. 
+ + [RFC793] Postel, J., "Transmission Control Protocol", STD 7, RFC + 793, September 1981. + + [RFC768] Postel, J., "User Datagram Protocol", STD 6, RFC 768, + August 1980. + + [RFC826] Plummer, D.C., "Ethernet Address Resolution Protocol: + Or converting network protocol addresses to 48-bit + Ethernet address for transmission on Ethernet + hardware", STD 37, RFC 826, November 1982. + + [RFC1071] Braden, R., Borman, D. and C. Partridge, "Computing the + Internet checksum", RFC 1071, September 1988. + + [RFC1112] Deering, S., "Host Extensions for IP Multicasting", STD + 5, RFC 1112, August 1989. + + [RFC1144] Jacobson, V., "Compressing TCP/IP Headers for Low-Speed + Serial Links", RFC 1144, February 1990. + + [RFC1191] Mogul, J. and S. Deering, "Path MTU Discovery", RFC + 1191, November 1990. + + [RFC1332] McGregor, C., "The PPP Internet Protocol Control + Protocol (IPCP)", RFC 1332, May 1992. + + [RFC1435] Knowles, S., "IESG Advice from Experience with Path MTU + Discovery", RFC 1435, March 1993. + + + + + + +Karn, et al. Best Current Practice [Page 49] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + [RFC1633] Braden, R., Clark, D. and S. Shenker, "Integrated + Services in the Internet Architecture: an Overview", + RFC 1633, June 1994. + + [RFC1661] Simpson, W., "The Point-to-Point Protocol (PPP)", STD + 51, RFC 1661, July 1994. + + [RFC1662] Simpson, W., Ed., "PPP in HDLC-like Framing", STD 51, + RFC 1662, July 1994. + + [RFC1750] Eastlake 3rd, D., Crocker, S. and J. Schiller, + "Randomness Recommendations for Security", RFC 1750, + December 1994. + + [RFC1812] Baker, F., Ed., "Requirements for IP Version 4 + Routers", RFC 1812, June 1995. + + [RFC1939] Myers, J. and M. Rose, "Post Office Protocol - Version + 3", STD 53, RFC 1939, May 1996. + + [RFC1981] McCann, J., Deering, S. and J. Mogul, "Path MTU + Discovery for IP version 6", RFC 1981, August 1996. + + [RFC1991] Atkins, D., Stallings, W. and P. 
Zimmermann, "PGP + Message Exchange Formats", RFC 1991, August 1996. + + [RFC2018] Mathis, M., Mahdavi, J., Floyd, S. and A. Romanow, "TCP + Selective Acknowledgement Options", RFC 2018, October + 1996. + + [RFC2131] Droms, R., "Dynamic Host Configuration Protocol", RFC + 2131, March 1997. + + [RFC2205] Braden, R., Ed., Zhang, L., Berson, S., Herzog, S. and + S. Jamin, "Resource ReSerVation Protocol (RSVP) -- + Version 1 Functional Specification", RFC 2205, + September 1997. + + [RFC2208] Mankin, A., Baker, F., Braden, B., Bradner, S., O'Dell, + M., Romanow, A., Weinrib, A. and L. Zhang, "Resource + ReSerVation Protocol (RSVP) -- Version 1 Applicability + Statement Some Guidelines on Deployment", RFC 2208, + September 1997. + + [RFC2210] Wroclawski, J., "The Use of RSVP with IETF Integrated + Services", RFC 2210, September 1997. + + + + + +Karn, et al. Best Current Practice [Page 50] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + [RFC2211] Wroclawski, J., "Specification of the Controlled-Load + Network Element Service", RFC 2211, September 1997. + + [RFC2212] Shenker, S., Partridge, C. and R. Guerin, + "Specification of Guaranteed Quality of Service", RFC + 2212, September 1997. + + [RFC2246] Dierks, T. and C. Allen, "The TLS Protocol Version + 1.0", RFC 2246, January 1999. + + [RFC2309] Braden, B., Clark, D., Crowcroft, J., Davie, B., + Deering, S., Estrin, D., Floyd, S., Jacobson, V., + Minshall, G., Partridge, C., Peterson, L., + Ramakrishnan, K., Shenker, S., Wroclawski, J. and L. + Zhang, "Recommendations on Queue Management and + Congestion Avoidance in the Internet", RFC 2309, April + 1998. + + [RFC2322] van den Hout, K., Koopal, A. and R. van Mook, + "Management of IP numbers by peg-dhcp", RFC 2322, 1 + April 1998. + + [RFC2328] Moy, J., "OSPF Version 2", STD 54, RFC 2328, April + 1998. + + [RFC2332] Luciani, J., Katz, D., Piscitello, D., Cole, B. and N. + Doraswamy, "NBMA Next Hop Resolution Protocol (NHRP)", + RFC 2332, April 1998.
+ + [RFC2364] Gross, G., Kaycee, M., Li, A., Malis, A. and J. + Stephens, "PPP Over AAL5", RFC 2364, July 1998. + + [RFC2394] Pereira, R., "IP Payload Compression Using DEFLATE", + RFC 2394, December 1998. + + [RFC2395] Friend, R. and R. Monsour, "IP Payload Compression + Using LZS", RFC 2395, December 1998. + + [RFC2401] Kent, S. and R. Atkinson, "Security Architecture for + the Internet Protocol", RFC 2401, November 1998. + + [RFC2406] Kent, S. and R. Atkinson, "IP Encapsulating Security + Payload (ESP)", RFC 2406, November 1998. + + [RFC2440] Callas, J., Donnerhacke, L., Finney, H. and R. Thayer, + "OpenPGP Message Format", RFC 2440, November 1998. + + + + + +Karn, et al. Best Current Practice [Page 51] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + [RFC2460] Deering, S. and R. Hinden, "Internet Protocol, Version + 6 (IPv6) Specification", RFC 2460, December 1998. + + [RFC2461] Narten, T., Nordmark, E. and W. Simpson, "Neighbor + Discovery for IP Version 6 (IPv6)", RFC 2461, December + 1998. + + [RFC2474] Nichols, K., Blake, S., Baker, F. and D. Black, + "Definition of the Differentiated Services Field (DS + Field) in the IPv4 and IPv6 Headers", RFC 2474, + December 1998. + + [RFC2475] Blake, S., Black, D., Carlson, M., Davies, E., Wang, Z. + and W. Weiss, "An Architecture for Differentiated + Services", RFC 2475, December 1998. + + [RFC2507] Degermark, M., Nordgren, B. and S. Pink, "IP Header + Compression", RFC 2507, February 1999. + + [RFC2508] Casner, S. and V. Jacobson, "Compressing IP/UDP/RTP + Headers for Low-Speed Serial Links", RFC 2508, February + 1999. + + [RFC2581] Allman, M., Paxson, V. and W. Stevens, "TCP Congestion + Control", RFC 2581, April 1999. + + [RFC2582] Floyd, S. and T. Henderson, "The NewReno Modification + to TCP's Fast Recovery Algorithm", RFC 2582, April + 1999. + + [RFC2597] Heinanen, J., Baker, F., Weiss, W. and J. Wroclawski, + "Assured Forwarding PHB Group", RFC 2597, June 1999. 
+ + [RFC2616] Fielding, R., Gettys, J., Mogul, J., Frystyk, H., + Masinter, L., Leach, P. and T. Berners-Lee, "Hypertext + Transfer Protocol -- HTTP/1.1", RFC 2616, June 1999. + + [RFC2630] Housley, R., "Cryptographic Message Syntax", RFC 2630, + June 1999. + + [RFC2631] Rescorla, E., "Diffie-Hellman Key Agreement Method", + RFC 2631, June 1999. + + [RFC2632] Ramsdell, B., Ed., "S/MIME Version 3 Certificate + Handling", RFC 2632, June 1999. + + + + + + +Karn, et al. Best Current Practice [Page 52] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + [RFC2633] Ramsdell, B., "S/MIME Version 3 Message Specification", + RFC 2633, June 1999. + + [RFC2634] Hoffman, P., "Enhanced Security Services for S/MIME", + RFC 2634, June 1999. + + [RFC2684] Grossman, D. and J. Heinanen, "Multiprotocol + Encapsulation over ATM Adaptation Layer 5", RFC 2684, + September 1999. + + [RFC2686] Bormann, C., "The Multi-Class Extension to Multi-Link + PPP", RFC 2686, September 1999. + + [RFC2687] Bormann, C., "PPP in a Real-time Oriented HDLC-like + Framing", RFC 2687, September 1999. + + [RFC2689] Bormann, C., "Providing Integrated Services over Low- + bitrate Links", RFC 2689, September 1999. + + [RFC2710] Deering, S., Fenner, W. and B. Haberman, "Multicast + Listener Discovery (MLD) for IPv6", RFC 2710, October + 1999. + + [RFC2784] Farinacci, D., Li, T., Hanks, S., Meyer, D. and P. + Traina, "Generic Routing Encapsulation (GRE)", RFC + 2784, March 2000. + + [RFC2865] Rigney, C., Willens, S., Rubens, A. and W. Simpson, + "Remote Authentication Dial In User Service (RADIUS)", + RFC 2865, June 2000. + + [RFC2914] Floyd, S., "Congestion Control Principles", BCP 41, RFC + 2914, September 2000. + + [RFC2923] Lahey, K., "TCP Problems with Path MTU Discovery", RFC + 2923, September 2000. + + [RFC2988] Paxson, V. and M. Allman, "Computing TCP's + Retransmission Timer", RFC 2988, November 2000. 
+ + [RFC2990] Huston, G., "Next Steps for the IP QoS Architecture", + RFC 2990, November 2000. + + [RFC3048] Whetten, B., Vicisano, L., Kermode, R., Handley, M., + Floyd, S. and M. Luby, "Reliable Multicast Transport + Building Blocks for One-to-Many Bulk-Data Transfer", + RFC 3048, January 2001. + + + + +Karn, et al. Best Current Practice [Page 53] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + [RFC3095] Bormann, C., Ed., Burmeister, C., Degermark, M., + Fukushima, H., Hannu, H., Jonsson, L-E., Hakenberg, R., + Koren, T., Le, K., Liu, Z., Martensson, A., Miyazaki, + A., Svanbro, K., Wiebke, T., Yoshimura, T. and H. + Zheng, "RObust Header Compression (ROHC): Framework + and four profiles: RTP, UDP, ESP, and uncompressed", + RFC 3095, July 2001. + + [RFC3096] Degermark, M., Ed., "Requirements for robust IP/UDP/RTP + header compression", RFC 3096, July 2001. + + [RFC3150] Dawkins, S., Montenegro, G., Kojo, M. and V. Magret, + "End-to-end Performance Implications of Slow Links", + BCP 48, RFC 3150, July 2001. + + [RFC3155] Dawkins, S., Montenegro, G., Kojo, M., Magret, V. and + N. Vaidya, "End-to-end Performance Implications of + Links with Errors", BCP 50, RFC 3155, August 2001. + + [RFC3168] Ramakrishnan, K., Floyd, S. and D. Black, "The Addition + of Explicit Congestion Notification (ECN) to IP", RFC + 3168, September 2001. + + [RFC3173] Shacham, A., Monsour, B., Pereira, R. and M. Thomas, + "IP Payload Compression Protocol (IPComp)", RFC 3173, + September 2001. + + [RFC3246] Davie, B., Charny, A., Bennet, J.C.R., Benson, K., Le + Boudec, J.Y., Courtney, W., Davari, S., Firoiu, V. and + D. Stiliadis, "An Expedited Forwarding PHB (Per-Hop + Behavior)", RFC 3246, March 2002. + + [RFC3248] Armitage, G., Carpenter, B., Casati, A., Crowcroft, J., + Halpern, J., Kumar, B. and J. Schnizlein, "A Delay + Bound alternative revision of RFC 2598", RFC 3248, + March 2002. 
+ + [RFC3344] Perkins, C., Ed., "IP Mobility Support for IPv4", RFC + 3344, August 2002. + + [RFC3366] Fairhurst, G. and L. Wood, "Advice to link designers on + link Automatic Repeat reQuest (ARQ)", BCP 62, RFC 3366, + August 2002. + + + + + + + + +Karn, et al. Best Current Practice [Page 54] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + [RFC3376] Cain, B., Deering, S., Kouvelas, I., Fenner, B. and A. + Thyagarajan, "Internet Group Management Protocol, + Version 3", RFC 3376, October 2002. + + [RFC3449] Balakrishnan, H., Padmanabhan, V., Fairhurst, G. and M. + Sooriyabandara, "TCP Performance Implications of + Network Path Asymmetry", BCP 69, RFC 3449, December + 2002. + + [RFC3450] Luby, M., Gemmell, J., Vicisano, L., Rizzo, L. and J. + Crowcroft, "Asynchronous Layered Coding (ALC) Protocol + Instantiation", RFC 3450, December 2002. + + [RFC3451] Luby, M., Gemmell, J., Vicisano, L., Rizzo, L., + Handley, M. and J. Crowcroft, "Layered Coding Transport + (LCT) Building Block", RFC 3451, December 2002. + + [RFC3452] Luby, M., Vicisano, L., Gemmell, J., Rizzo, L., + Handley, M. and J. Crowcroft, "Forward Error Correction + (FEC) Building Block", RFC 3452, December 2002. + + [RFC3453] Luby, M., Vicisano, L., Gemmell, J., Rizzo, L., + Handley, M. and J. Crowcroft, "The Use of Forward Error + Correction (FEC) in Reliable Multicast", RFC 3453, + December 2002. + + [RFC3488] Wu, I. and T. Eckert, "Cisco Systems Router-port Group + Management Protocol (RGMP)", RFC 3488, February 2003. + + [RFC3501] Crispin, M., "INTERNET MESSAGE ACCESS PROTOCOL - + VERSION 4rev1", RFC 3501, March 2003. + + [RFC3828] Larzon, L-A., Degermark, M., Pink, S., Jonsson, L-E., + Ed. and G. Fairhurst, Ed., "The User Datagram Protocol + (UDP)-Lite Protocol", RFC 3828, June 2004. + + [Schneier95] Schneier, B., Applied Cryptography: Protocols, + Algorithms and Source Code in C (John Wiley and Sons, + October 1995). 
+ + [Schneier00] Schneier, B., Secrets and Lies: Digital Security in a + Networked World (John Wiley and Sons, August 2000). + + [SP2000] Stone, J. and C. Partridge, "When the CRC and TCP + Checksum Disagree", ACM SIGCOMM, September 2000. + http://www.acm.org/sigcomm/sigcomm2000/conf/ + paper/sigcomm2000-9-1.pdf + + + + +Karn, et al. Best Current Practice [Page 55] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + [SRC81] Saltzer, J., Reed D. and D. Clark, "End-to-End + Arguments in System Design". Second International + Conference on Distributed Computing Systems (April, + 1981) pages 509-512. Published with minor changes in + ACM Transactions in Computer Systems 2, 4, November, + 1984, pages 277-288. Reprinted in Craig Partridge, + editor Innovations in internetworking. Artech House, + Norwood, MA, 1988, pages 195-206. ISBN 0-89006-337-0. + + [SSL2] Hickman, K., "The SSL Protocol", Netscape + Communications Corp., Feb 9, 1995. + + [SSL3] Frier, A., Karlton, P. and P. Kocher, "The SSL 3.0 + Protocol", Netscape Communications Corp., Nov 18, 1996. + + [TCPF98] Lin, D. and H.T. Kung, "TCP Fast Recovery Strategies: + Analysis and Improvements", IEEE Infocom, March 1998. + http://www.eecs.harvard.edu/networking/papers/infocom- + tcp-final-198.pdf + + [WFBA2000] Wagner, D., Foster, J., Brewer, E. and A. Aiken, "A + First Step Toward Automated Detection of Buffer Overrun + Vulnerabilities", Proceedings of NDSS2000. + http://www.isoc.org/isoc/conferences/ndss/ + 2000/proceedings/039.pdf + + [Wilbur89] Wilbur, Steve R., Jon Crowcroft, and Yuko Murayama. + "MAC layer Security Measures in Local Area Networks", + Local Area Network Security, Workshop LANSEC '89 + Proceedings, Springer-Verlag, April 1989, pp. 53-64. + + + + + + + + + + + + + + + + + + + + + +Karn, et al. Best Current Practice [Page 56] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + +21. 
Contributors' Addresses + + Aaron Falk + USC/Information Sciences Institute + 4676 Admiralty Way + Marina Del Rey, CA 90292 + + Phone: 310-448-9327 + EMail: falk@isi.edu + + + Saverio Mascolo + Dipartimento di Elettrotecnica ed Elettronica, + Politecnico di Bari Via Orabona 4, 70125 Bari, Italy + + Phone: +39 080 596 3621 + EMail: mascolo@poliba.it + URL: http://www-dee.poliba.it/dee-web/Personale/mascolo.html + + + Marie-Jose Montpetit + MJMontpetit.com + + EMail: marie@mjmontpetit.com + + + + + + + + + + + + + + + + + + + + + + + + + + + +Karn, et al. Best Current Practice [Page 57] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + +22. Authors' Addresses + + Phil Karn, Editor + Qualcomm 5775 Morehouse Drive + San Diego CA 92121 + + Phone: 858 587 1121 + EMail: karn@qualcomm.com + + + Carsten Bormann + Universitaet Bremen TZI + Postfach 330440 + D-28334 Bremen, Germany + + Phone: +49 421 218 7024 + Fax: +49 421 218 7000 + EMail: cabo@tzi.org + + + Godred (Gorry) Fairhurst + Department of Engineering, University of Aberdeen, + Aberdeen, AB24 3UE, United Kingdom + + EMail: gorry@erg.abdn.ac.uk + URL: http://www.erg.abdn.ac.uk/users/gorry + + + Dan Grossman + Motorola, Inc. + 111 Locke Drive + Marlboro, MA 01752 + + EMail: Dan.Grossman@motorola.com + + + Reiner Ludwig + Ericsson Research + Ericsson Allee + 1 52134 Herzogenrath, Germany + + Phone: +49 2407 575 719 + EMail: Reiner.Ludwig@ericsson.com + + + + + + + + +Karn, et al. Best Current Practice [Page 58] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + + Jamshid Mahdavi + Novell, Inc. 
+ + EMail: jmahdavi@earthlink.net + + + Gabriel Montenegro + Sun Microsystems Laboratories, Europe + 180, Avenue de l'Europe + 38334 Saint Ismier CEDEX + France + + EMail: gab@sun.com + + + Joe Touch + USC/Information Sciences Institute + 4676 Admiralty Way + Marina del Rey CA 90292 + + Phone: 310 448 9151 + EMail: touch@isi.edu + URL: http://www.isi.edu/touch + + + Lloyd Wood + Cisco Systems + 9 New Square Park, Bedfont Lakes + Feltham TW14 8HA + United Kingdom + + Phone: +44 (0)20 8824 4236 + EMail: lwood@cisco.com + URL: http://www.ee.surrey.ac.uk/Personal/L.Wood/ + + + + + + + + + + + + + + + + + +Karn, et al. Best Current Practice [Page 59] + +RFC 3819 Advice for Internet Subnetwork Designers July 2004 + + +23. Full Copyright Statement + + Copyright (C) The Internet Society (2004). This document is subject + to the rights, licenses and restrictions contained in BCP 78, and + except as set forth therein, the authors retain all their rights. + + This document and the information contained herein are provided on an + "AS IS" basis and THE CONTRIBUTOR, THE ORGANIZATION HE/SHE + REPRESENTS OR IS SPONSORED BY (IF ANY), THE INTERNET SOCIETY AND THE + INTERNET ENGINEERING TASK FORCE DISCLAIM ALL WARRANTIES, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF + THE INFORMATION HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED + WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + +Intellectual Property + + The IETF takes no position regarding the validity or scope of any + Intellectual Property Rights or other rights that might be claimed + to pertain to the implementation or use of the technology + described in this document or the extent to which any license + under such rights might or might not be available; nor does it + represent that it has made any independent effort to identify any + such rights. Information on the procedures with respect to + rights in RFC documents can be found in BCP 78 and BCP 79. 
+ + Copies of IPR disclosures made to the IETF Secretariat and any + assurances of licenses to be made available, or the result of an + attempt made to obtain a general license or permission for the use + of such proprietary rights by implementers or users of this + specification can be obtained from the IETF on-line IPR repository + at http://www.ietf.org/ipr. + + The IETF invites any interested party to bring to its attention + any copyrights, patents or patent applications, or other + proprietary rights that may cover technology that may be required + to implement this standard. Please address the information to the + IETF at ietf-ipr@ietf.org. + +Acknowledgement + + Funding for the RFC Editor function is currently provided by the + Internet Society. + + + + + + + + + +Karn, et al. Best Current Practice [Page 60] + diff --git a/ext/picotcp/RFC/rfc3927.txt b/ext/picotcp/RFC/rfc3927.txt new file mode 100644 index 0000000..466b9eb --- /dev/null +++ b/ext/picotcp/RFC/rfc3927.txt @@ -0,0 +1,1851 @@ + + + + + + +Network Working Group S. Cheshire +Request for Comments: 3927 Apple Computer +Category: Standards Track B. Aboba + Microsoft Corporation + E. Guttman + Sun Microsystems + May 2005 + + + Dynamic Configuration of IPv4 Link-Local Addresses + +Status of This Memo + + This document specifies an Internet standards track protocol for the + Internet community, and requests discussion and suggestions for + improvements. Please refer to the current edition of the "Internet + Official Protocol Standards" (STD 1) for the standardization state + and status of this protocol. Distribution of this memo is unlimited. + +Copyright Notice + + Copyright (C) The Internet Society (2005). + + +Abstract + + To participate in wide-area IP networking, a host needs to be + configured with IP addresses for its interfaces, either manually by + the user or automatically from a source on the network such as a + Dynamic Host Configuration Protocol (DHCP) server. 
Unfortunately, + such address configuration information may not always be available. + It is therefore beneficial for a host to be able to depend on a + useful subset of IP networking functions even when no address + configuration is available. This document describes how a host may + automatically configure an interface with an IPv4 address within the + 169.254/16 prefix that is valid for communication with other devices + connected to the same physical (or logical) link. + + IPv4 Link-Local addresses are not suitable for communication with + devices not directly connected to the same physical (or logical) + link, and are only used where stable, routable addresses are not + available (such as on ad hoc or isolated networks). This document + does not recommend that IPv4 Link-Local addresses and routable + addresses be configured simultaneously on the same interface. + + + + + + + +Cheshire, et al. Standards Track [Page 1] + +RFC 3927 IPv4 Link-Local May 2005 + + +Table of Contents + + 1. Introduction. . . . . . . . . . . . . . . . . . . . . . . . . 3 + 1.1. Requirements. . . . . . . . . . . . . . . . . . . . . . 3 + 1.2. Terminology . . . . . . . . . . . . . . . . . . . . . . 3 + 1.3. Applicability . . . . . . . . . . . . . . . . . . . . . 5 + 1.4. Application Layer Protocol Considerations . . . . . . . 6 + 1.5. Autoconfiguration Issues. . . . . . . . . . . . . . . . 7 + 1.6. Alternate Use Prohibition . . . . . . . . . . . . . . . 7 + 1.7. Multiple Interfaces . . . . . . . . . . . . . . . . . . 8 + 1.8. Communication with Routable Addresses . . . . . . . . . 8 + 1.9. When to configure an IPv4 Link-Local Address. . . . . . 8 + 2. Address Selection, Defense and Delivery . . . . . . . . . . . 9 + 2.1. Link-Local Address Selection. . . . . . . . . . . . . . 10 + 2.2. Claiming a Link-Local Address . . . . . . . . . . . . . 11 + 2.3. Shorter Timeouts. . . . . . . . . . . . . . . . . . . . 13 + 2.4. Announcing an Address . . . . . . . . . . . . . . . . . 13 + 2.5. 
Conflict Detection and Defense. . . . . . . . . . . . . 13 + 2.6. Address Usage and Forwarding Rules. . . . . . . . . . . 14 + 2.7. Link-Local Packets Are Not Forwarded. . . . . . . . . . 16 + 2.8. Link-Local Packets are Local. . . . . . . . . . . . . . 16 + 2.9. Higher-Layer Protocol Considerations. . . . . . . . . . 17 + 2.10. Privacy Concerns. . . . . . . . . . . . . . . . . . . . 17 + 2.11. Interaction between DHCPv4 and IPv4 Link-Local + State Machines. . . . . . . . . . . . . . . . . . . . . 17 + 3. Considerations for Multiple Interfaces. . . . . . . . . . . . 18 + 3.1. Scoped Addresses. . . . . . . . . . . . . . . . . . . . 18 + 3.2. Address Ambiguity . . . . . . . . . . . . . . . . . . . 19 + 3.3. Interaction with Hosts with Routable Addresses. . . . . 20 + 3.4. Unintentional Autoimmune Response . . . . . . . . . . . 21 + 4. Healing of Network Partitions . . . . . . . . . . . . . . . . 22 + 5. Security Considerations . . . . . . . . . . . . . . . . . . . 23 + 6. Application Programming Considerations. . . . . . . . . . . . 24 + 6.1. Address Changes, Failure and Recovery . . . . . . . . . 24 + 6.2. Limited Forwarding of Locators. . . . . . . . . . . . . 24 + 6.3. Address Ambiguity . . . . . . . . . . . . . . . . . . . 25 + 7. Router Considerations . . . . . . . . . . . . . . . . . . . . 25 + 8. IANA Considerations . . . . . . . . . . . . . . . . . . . . . 25 + 9. Constants . . . . . . . . . . . . . . . . . . . . . . . . . . 26 + 10. References. . . . . . . . . . . . . . . . . . . . . . . . . . 26 + 10.1. Normative References. . . . . . . . . . . . . . . . . . 26 + 10.2. Informative References. . . . . . . . . . . . . . . . . 26 + Acknowledgments . . . . . . . . . . . . . . . . . . . . . . . . . 27 + Appendix A - Prior Implementations. . . . . . . . . . . . . . . . 28 + + + + + + + +Cheshire, et al. Standards Track [Page 2] + +RFC 3927 IPv4 Link-Local May 2005 + + +1. 
Introduction + + As the Internet Protocol continues to grow in popularity, it becomes + increasingly valuable to be able to use familiar IP tools such as FTP + not only for global communication, but for local communication as + well. For example, two people with laptop computers supporting IEEE + 802.11 Wireless LANs [802.11] may meet and wish to exchange files. + It is desirable for these people to be able to use IP application + software without the inconvenience of having to manually configure + static IP addresses or set up a DHCP server [RFC2131]. + + This document describes a method by which a host may automatically + configure an interface with an IPv4 address in the 169.254/16 prefix + that is valid for Link-Local communication on that interface. This + is especially valuable in environments where no other configuration + mechanism is available. The IPv4 prefix 169.254/16 is registered + with the IANA for this purpose. Allocation of IPv6 Link-Local + addresses is described in "IPv6 Stateless Address Autoconfiguration" + [RFC2462]. + + Link-Local communication using IPv4 Link-Local addresses is only + suitable for communication with other devices connected to the same + physical (or logical) link. Link-Local communication using IPv4 + Link-Local addresses is not suitable for communication with devices + not directly connected to the same physical (or logical) link. + + Microsoft Windows 98 (and later) and Mac OS 8.5 (and later) already + support this capability. This document standardizes usage, + prescribing rules for how IPv4 Link-Local addresses are to be treated + by hosts and routers. In particular, it describes how routers are to + behave when receiving packets with IPv4 Link-Local addresses in the + source or destination address. With respect to hosts, it discusses + claiming and defending addresses, maintaining Link-Local and routable + IPv4 addresses on the same interface, and multi-homing issues. + +1.1. 
Requirements + + The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", + "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this + document are to be interpreted as described in "Key words for use in + RFCs" [RFC2119]. + +1.2. Terminology + + This document describes Link-Local addressing, for IPv4 communication + between two hosts on a single link. A set of hosts is considered to + be "on the same link", if: + + + + +Cheshire, et al. Standards Track [Page 3] + +RFC 3927 IPv4 Link-Local May 2005 + + + - when any host A from that set sends a packet to any other host B + in that set, using unicast, multicast, or broadcast, the entire + link-layer packet payload arrives unmodified, and + + - a broadcast sent over that link by any host from that set of hosts + can be received by every other host in that set + + The link-layer *header* may be modified, such as in Token Ring Source + Routing [802.5], but not the link-layer *payload*. In particular, if + any device forwarding a packet modifies any part of the IP header or + IP payload then the packet is no longer considered to be on the same + link. This means that the packet may pass through devices such as + repeaters, bridges, hubs or switches and still be considered to be on + the same link for the purpose of this document, but not through a + device such as an IP router that decrements the TTL or otherwise + modifies the IP header. + + This document uses the term "routable address" to refer to all valid + unicast IPv4 addresses outside the 169.254/16 prefix that may be + forwarded via routers. This includes all global IP addresses and + private addresses such as Net 10/8 [RFC1918], but not loopback + addresses such as 127.0.0.1. + + Wherever this document uses the term "host" when describing use of + IPv4 Link-Local addresses, the text applies equally to routers when + they are the source of or intended destination of packets containing + IPv4 Link-Local source or destination addresses. 
+ + Wherever this document uses the term "sender IP address" or "target + IP address" in the context of an ARP packet, it is referring to the + fields of the ARP packet identified in the ARP specification [RFC826] + as "ar$spa" (Sender Protocol Address) and "ar$tpa" (Target Protocol + Address) respectively. For the usage of ARP described in this + document, each of these fields always contains an IP address. + + In this document, the term "ARP Probe" is used to refer to an ARP + Request packet, broadcast on the local link, with an all-zero 'sender + IP address'. The 'sender hardware address' MUST contain the hardware + address of the interface sending the packet. The 'target hardware + address' field is ignored and SHOULD be set to all zeroes. The + 'target IP address' field MUST be set to the address being probed. + + In this document, the term "ARP Announcement" is used to refer to an + ARP Request packet, broadcast on the local link, identical to the ARP + Probe described above, except that both the sender and target IP + address fields contain the IP address being announced. + + + + + +Cheshire, et al. Standards Track [Page 4] + +RFC 3927 IPv4 Link-Local May 2005 + + + Constants are introduced in all capital letters. Their values are + given in Section 9. + +1.3. Applicability + + This specification applies to all IEEE 802 Local Area Networks (LANs) + [802], including Ethernet [802.3], Token-Ring [802.5] and IEEE 802.11 + wireless LANs [802.11], as well as to other link-layer technologies + that operate at data rates of at least 1 Mbps, have a round-trip + latency of at most one second, and support ARP [RFC826]. Wherever + this document uses the term "IEEE 802", the text applies equally to + any of these network technologies. 
+ + Link-layer technologies that support ARP but operate at rates below 1 + Mbps or latencies above one second may need to specify different + values for the following parameters: + + (a) the number of, and interval between, ARP probes, see PROBE_NUM, + PROBE_MIN, PROBE_MAX defined in Section 2.2.1 + + (b) the number of, and interval between, ARP announcements, see + ANNOUNCE_NUM and ANNOUNCE_INTERVAL defined in Section 2.4 + + (c) the maximum rate at which address claiming may be attempted, see + RATE_LIMIT_INTERVAL and MAX_CONFLICTS defined in Section 2.2.1 + + (d) the time interval between conflicting ARPs below which a host + MUST reconfigure instead of attempting to defend its address, see + DEFEND_INTERVAL defined in Section 2.5 + + Link-layer technologies that do not support ARP may be able to use + other techniques for determining whether a particular IP address is + currently in use. However, the application of claim-and-defend + mechanisms to such networks is outside the scope of this document. + + This specification is intended for use with small ad hoc networks -- + a single link containing only a few hosts. Although 65024 IPv4 + Link-Local addresses are available in principle, attempting to use + all those addresses on a single link would result in a high + probability of address conflicts, requiring a host to take an + inordinate amount of time to find an available address. + + Network operators with more than 1300 hosts on a single link may want + to consider dividing that single link into two or more subnets. A + host connecting to a link that already has 1300 hosts, selecting an + IPv4 Link-Local address at random, has a 98% chance of selecting an + unused IPv4 Link-Local address on the first try. A host has a 99.96% + + + + +Cheshire, et al. Standards Track [Page 5] + +RFC 3927 IPv4 Link-Local May 2005 + + + chance of selecting an unused IPv4 Link-Local address within two + tries. 
The probability that it will have to try more than ten times + is about 1 in 10^17. + +1.4. Application Layer Protocol Considerations + + IPv4 Link-Local addresses and their dynamic configuration have + profound implications upon applications which use them. This is + discussed in Section 6. Many applications fundamentally assume that + addresses of communicating peers are routable, relatively unchanging + and unique. These assumptions no longer hold with IPv4 Link-Local + addresses, or a mixture of Link-Local and routable IPv4 addresses. + + Therefore while many applications will work properly with IPv4 Link- + Local addresses, or a mixture of Link-Local and routable IPv4 + addresses, others may do so only after modification, or will exhibit + reduced or partial functionality. + + In some cases it may be infeasible for the application to be modified + to operate under such conditions. + + IPv4 Link-Local addresses should therefore only be used where stable, + routable addresses are not available (such as on ad hoc or isolated + networks) or in controlled situations where these limitations and + their impact on applications are understood and accepted. This + document does not recommend that IPv4 Link-Local addresses and + routable addresses be configured simultaneously on the same + interface. + + Use of IPv4 Link-Local addresses in off-link communication is likely + to cause application failures. This can occur within any application + that includes embedded addresses, if an IPv4 Link-Local address is + embedded when communicating with a host that is not on the link. + Examples of applications that embed addresses include IPsec, Kerberos + 4/5, FTP, RSVP, SMTP, SIP, X-Windows/Xterm/Telnet, Real Audio, H.323, + and SNMP [RFC3027]. + + To preclude use of IPv4 Link-Local addresses in off-link + communication, the following cautionary measures are advised: + + a. IPv4 Link-Local addresses MUST NOT be configured in the DNS. 
+ Mapping from IPv4 addresses to host names is conventionally done + by issuing DNS queries for names of the form, + "x.x.x.x.in-addr.arpa." When used for link-local addresses, which + have significance only on the local link, it is inappropriate to + send such DNS queries beyond the local link. DNS clients MUST NOT + send DNS queries for any name that falls within the + "254.169.in-addr.arpa." domain. + + + +Cheshire, et al. Standards Track [Page 6] + +RFC 3927 IPv4 Link-Local May 2005 + + + DNS recursive name servers receiving queries from non-compliant + clients for names within the "254.169.in-addr.arpa." domain MUST + by default return RCODE 3, authoritatively asserting that no such + name exists in the Domain Name System. + + b. Names that are globally resolvable to routable addresses should be + used within applications whenever they are available. Names that + are resolvable only on the local link (such as through use of + protocols such as Link Local Multicast Name Resolution [LLMNR]) + MUST NOT be used in off-link communication. IPv4 addresses and + names that can only be resolved on the local link SHOULD NOT be + forwarded beyond the local link. IPv4 Link-Local addresses SHOULD + only be sent when a Link-Local address is used as the source + and/or destination address. This strong advice should hinder + limited scope addresses and names from leaving the context in + which they apply. + + c. If names resolvable to globally routable addresses are not + available, but the globally routable addresses are, they should be + used instead of IPv4 Link-Local addresses. + +1.5. Autoconfiguration Issues + + Implementations of IPv4 Link-Local address autoconfiguration MUST + expect address conflicts, and MUST be prepared to handle them + gracefully by automatically selecting a new address whenever a + conflict is detected, as described in Section 2. 
This requirement to + detect and handle address conflicts applies during the entire period + that a host is using a 169.254/16 IPv4 Link-Local address, not just + during initial interface configuration. For example, address + conflicts can occur well after a host has completed booting if two + previously separate networks are joined, as described in Section 4. + +1.6. Alternate Use Prohibition + + Note that addresses in the 169.254/16 prefix SHOULD NOT be configured + manually or by a DHCP server. Manual or DHCP configuration may cause + a host to use an address in the 169.254/16 prefix without following + the special rules regarding duplicate detection and automatic + configuration that pertain to addresses in this prefix. While the + DHCP specification [RFC2131] indicates that a DHCP client SHOULD + probe a newly received address with ARP, this is not mandatory. + Similarly, while the DHCP specification recommends that a DHCP server + SHOULD probe an address using an ICMP Echo Request before allocating + it, this is also not mandatory, and even if the server does this, + IPv4 Link-Local addresses are not routable, so a DHCP server not + directly connected to a link cannot detect whether a host on that + link is already using the desired IPv4 Link-Local address. + + + +Cheshire, et al. Standards Track [Page 7] + +RFC 3927 IPv4 Link-Local May 2005 + + + Administrators wishing to configure their own local addresses (using + manual configuration, a DHCP server, or any other mechanism not + described in this document) should use one of the existing private + address prefixes [RFC1918], not the 169.254/16 prefix. + +1.7. Multiple Interfaces + + Additional considerations apply to hosts that support more than one + active interface where one or more of these interfaces support IPv4 + Link-Local address configuration. These considerations are discussed + in Section 3. + +1.8. 
Communication with Routable Addresses + + There will be cases when devices with a configured Link-Local address + will need to communicate with a device with a routable address + configured on the same physical link, and vice versa. The rules in + Section 2.6 allow this communication. + + This allows, for example, a laptop computer with only a routable + address to communicate with web servers world-wide using its + globally-routable address while at the same time printing those web + pages on a local printer that has only an IPv4 Link-Local address. + +1.9. When to configure an IPv4 Link-Local address + + Having addresses of multiple different scopes assigned to an + interface, with no adequate way to determine in what circumstances + each address should be used, leads to complexity for applications and + confusion for users. A host with an address on a link can + communicate with all other devices on that link, whether those + devices use Link-Local addresses, or routable addresses. For these + reasons, a host SHOULD NOT have both an operable routable address and + an IPv4 Link-Local address configured on the same interface. The + term "operable address" is used to mean an address which works + effectively for communication in the current network context (see + below). When an operable routable address is available on an + interface, the host SHOULD NOT also assign an IPv4 Link-Local address + on that interface. However, during the transition (in either + direction) between using routable and IPv4 Link-Local addresses both + MAY be in use at once subject to these rules: + + 1. The assignment of an IPv4 Link-Local address on an interface is + based solely on the state of the interface, and is independent + of any other protocols such as DHCP. A host MUST NOT alter its + behavior and use of other protocols such as DHCP because the + host has assigned an IPv4 Link-Local address to an interface. + + + + +Cheshire, et al. 
Standards Track [Page 8] + +RFC 3927 IPv4 Link-Local May 2005 + + + 2. If a host finds that an interface that was previously + configured with an IPv4 Link-Local address now has an operable + routable address available, the host MUST use the routable + address when initiating new communications, and MUST cease + advertising the availability of the IPv4 Link-Local address + through whatever mechanisms that address had been made known to + others. The host SHOULD continue to use the IPv4 Link-Local + address for communications already underway, and MAY continue + to accept new communications addressed to the IPv4 Link-Local + address. Ways in which an operable routable address might + become available on an interface include: + + * Manual configuration + * Address assignment through DHCP + * Roaming of the host to a network on which a previously + assigned address becomes operable + + 3. If a host finds that an interface no longer has an operable + routable address available, the host MAY identify a usable IPv4 + Link-Local address (as described in section 2) and assign that + address to the interface. Ways in which an operable routable + address might cease to be available on an interface include: + + * Removal of the address from the interface through + manual configuration + * Expiration of the lease on the address assigned through + DHCP + * Roaming of the host to a new network on which the + address is no longer operable. + + The determination by the system of whether an address is "operable" + is not clear cut and many changes in the system context (e.g., + router changes) may affect the operability of an address. In + particular roaming of a host from one network to another is likely -- + but not certain -- to change the operability of a configured address + but detecting such a move is not always trivial. + + "Detection of Network Attachment (DNA) in IPv4" [DNAv4] provides + further discussion of address assignment and operability + determination. + +2. 
Address Selection, Defense and Delivery + + The following section explains the IPv4 Link-Local address selection + algorithm, how IPv4 Link-Local addresses are defended, and how IPv4 + packets with IPv4 Link-Local addresses are delivered. + + + + + +Cheshire, et al. Standards Track [Page 9] + +RFC 3927 IPv4 Link-Local May 2005 + + + Windows and Mac OS hosts that already implement Link-Local IPv4 + address auto-configuration are compatible with the rules presented in + this section. However, should any interoperability problem be + discovered, this document, not any prior implementation, defines the + standard. + +2.1. Link-Local Address Selection + + When a host wishes to configure an IPv4 Link-Local address, it + selects an address using a pseudo-random number generator with a + uniform distribution in the range from 169.254.1.0 to 169.254.254.255 + inclusive. + + The IPv4 prefix 169.254/16 is registered with the IANA for this + purpose. The first 256 and last 256 addresses in the 169.254/16 + prefix are reserved for future use and MUST NOT be selected by a host + using this dynamic configuration mechanism. + + The pseudo-random number generation algorithm MUST be chosen so that + different hosts do not generate the same sequence of numbers. If the + host has access to persistent information that is different for each + host, such as its IEEE 802 MAC address, then the pseudo-random number + generator SHOULD be seeded using a value derived from this + information. This means that even without using any other persistent + storage, a host will usually select the same IPv4 Link-Local address + each time it is booted, which can be convenient for debugging and + other operational reasons. 
Seeding the pseudo-random number + generator using the real-time clock or any other information which is + (or may be) identical in every host is NOT suitable for this purpose, + because a group of hosts that are all powered on at the same time + might then all generate the same sequence, resulting in a never- + ending series of conflicts as the hosts move in lock-step through + exactly the same pseudo-random sequence, conflicting on every address + they probe. + + Hosts that are equipped with persistent storage MAY, for each + interface, record the IPv4 address they have selected. On booting, + hosts with a previously recorded address SHOULD use that address as + their first candidate when probing. This increases the stability of + addresses. For example, if a group of hosts are powered off at + night, then when they are powered on the next morning they will all + resume using the same addresses, instead of picking different + addresses and potentially having to resolve conflicts that arise. + + + + + + + + +Cheshire, et al. Standards Track [Page 10] + +RFC 3927 IPv4 Link-Local May 2005 + + +2.2. Claiming a Link-Local Address + + After it has selected an IPv4 Link-Local address, a host MUST test to + see if the IPv4 Link-Local address is already in use before beginning + to use it. When a network interface transitions from an inactive to + an active state, the host does not have knowledge of what IPv4 Link- + Local addresses may currently be in use on that link, since the point + of attachment may have changed or the network interface may have been + inactive when a conflicting address was claimed. + + Were the host to immediately begin using an IPv4 Link-Local address + which is already in use by another host, this would be disruptive to + that other host. 
Since it is possible that the host has changed its + point of attachment, a routable address may be obtainable on the new + network, and therefore it cannot be assumed that an IPv4 Link-Local + address is to be preferred. + + Before using the IPv4 Link-Local address (e.g., using it as the + source address in an IPv4 packet, or as the Sender IPv4 address in an + ARP packet) a host MUST perform the probing test described below to + achieve better confidence that using the IPv4 Link-Local address will + not cause disruption. + + Examples of events that involve an interface becoming active include: + + Reboot/startup + Wake from sleep (if network interface was inactive during sleep) + Bringing up previously inactive network interface + IEEE 802 hardware link-state change (appropriate for the + media type and security mechanisms which apply) indicates + that an interface has become active. + Association with a wireless base station or ad hoc network. + + A host MUST NOT perform this check periodically as a matter of + course. This would be a waste of network bandwidth, and is + unnecessary due to the ability of hosts to passively discover + conflicts, as described in Section 2.5. + +2.2.1. Probe details + + On a link-layer such as IEEE 802 that supports ARP, conflict + detection is done using ARP probes. On link-layer technologies that + do not support ARP other techniques may be available for determining + whether a particular IPv4 address is currently in use. However, the + application of claim-and-defend mechanisms to such networks is + outside the scope of this document. + + + + + +Cheshire, et al. Standards Track [Page 11] + +RFC 3927 IPv4 Link-Local May 2005 + + + A host probes to see if an address is already in use by broadcasting + an ARP Request for the desired address. The client MUST fill in the + 'sender hardware address' field of the ARP Request with the hardware + address of the interface through which it is sending the packet. 
The + 'sender IP address' field MUST be set to all zeroes, to avoid + polluting ARP caches in other hosts on the same link in the case + where the address turns out to be already in use by another host. + The 'target hardware address' field is ignored and SHOULD be set to + all zeroes. The 'target IP address' field MUST be set to the address + being probed. An ARP Request constructed this way with an all-zero + 'sender IP address' is referred to as an "ARP Probe". + + When ready to begin probing, the host should then wait for a random + time interval selected uniformly in the range zero to PROBE_WAIT + seconds, and should then send PROBE_NUM probe packets, each of these + probe packets spaced randomly, PROBE_MIN to PROBE_MAX seconds apart. + If during this period, from the beginning of the probing process + until ANNOUNCE_WAIT seconds after the last probe packet is sent, the + host receives any ARP packet (Request *or* Reply) on the interface + where the probe is being performed where the packet's 'sender IP + address' is the address being probed for, then the host MUST treat + this address as being in use by some other host, and MUST select a + new pseudo-random address and repeat the process. In addition, if + during this period the host receives any ARP Probe where the packet's + 'target IP address' is the address being probed for, and the packet's + 'sender hardware address' is not the hardware address of the + interface the host is attempting to configure, then the host MUST + similarly treat this as an address conflict and select a new address + as above. This can occur if two (or more) hosts attempt to configure + the same IPv4 Link-Local address at the same time. 
+ + A host should maintain a counter of the number of address conflicts + it has experienced in the process of trying to acquire an address, + and if the number of conflicts exceeds MAX_CONFLICTS then the host + MUST limit the rate at which it probes for new addresses to no more + than one new address per RATE_LIMIT_INTERVAL. This is to prevent + catastrophic ARP storms in pathological failure cases, such as a + rogue host that answers all ARP probes, causing legitimate hosts to + go into an infinite loop attempting to select a usable address. + + If, by ANNOUNCE_WAIT seconds after the transmission of the last ARP + Probe no conflicting ARP Reply or ARP Probe has been received, then + the host has successfully claimed the desired IPv4 Link-Local + address. + + + + + + + +Cheshire, et al. Standards Track [Page 12] + +RFC 3927 IPv4 Link-Local May 2005 + + +2.3. Shorter Timeouts + + Network technologies may emerge for which shorter delays are + appropriate than those required by this document. A subsequent IETF + publication may be produced providing guidelines for different values + for PROBE_WAIT, PROBE_NUM, PROBE_MIN and PROBE_MAX on those + technologies. + +2.4. Announcing an Address + + Having probed to determine a unique address to use, the host MUST + then announce its claimed address by broadcasting ANNOUNCE_NUM ARP + announcements, spaced ANNOUNCE_INTERVAL seconds apart. An ARP + announcement is identical to the ARP Probe described above, except + that now the sender and target IP addresses are both set to the + host's newly selected IPv4 address. The purpose of these ARP + announcements is to make sure that other hosts on the link do not + have stale ARP cache entries left over from some other host that may + previously have been using the same address. + +2.5. Conflict Detection and Defense + + Address conflict detection is not limited to the address selection + phase, when a host is sending ARP probes. 
Address conflict detection + is an ongoing process that is in effect for as long as a host is + using an IPv4 Link-Local address. At any time, if a host receives an + ARP packet (request *or* reply) on an interface where the 'sender IP + address' is the IP address the host has configured for that + interface, but the 'sender hardware address' does not match the + hardware address of that interface, then this is a conflicting ARP + packet, indicating an address conflict. + + A host MUST respond to a conflicting ARP packet as described in + either (a) or (b) below: + + (a) Upon receiving a conflicting ARP packet, a host MAY elect to + immediately configure a new IPv4 Link-Local address as described + above, or + + (b) If a host currently has active TCP connections or other reasons + to prefer to keep the same IPv4 address, and it has not seen any + other conflicting ARP packets within the last DEFEND_INTERVAL + seconds, then it MAY elect to attempt to defend its address by + recording the time that the conflicting ARP packet was received, and + then broadcasting one single ARP announcement, giving its own IP and + hardware addresses as the sender addresses of the ARP. Having done + this, the host can then continue to use the address normally without + any further special action. However, if this is not the first + + + +Cheshire, et al. Standards Track [Page 13] + +RFC 3927 IPv4 Link-Local May 2005 + + + conflicting ARP packet the host has seen, and the time recorded for + the previous conflicting ARP packet is recent, within DEFEND_INTERVAL + seconds, then the host MUST immediately cease using this address and + configure a new IPv4 Link-Local address as described above. This is + necessary to ensure that two hosts do not get stuck in an endless + loop with both hosts trying to defend the same address. + + A host MUST respond to conflicting ARP packets as described in either + (a) or (b) above. A host MUST NOT ignore conflicting ARP packets. 
+ + Forced address reconfiguration may be disruptive, causing TCP + connections to be broken. However, it is expected that such + disruptions will be rare, and if inadvertent address duplication + happens, then disruption of communication is inevitable, no matter + how the addresses were assigned. It is not possible for two + different hosts using the same IP address on the same network to + operate reliably. + + Before abandoning an address due to a conflict, hosts SHOULD actively + attempt to reset any existing connections using that address. This + mitigates some security threats posed by address reconfiguration, as + discussed in Section 5. + + Immediately configuring a new address as soon as the conflict is + detected is the best way to restore useful communication as quickly + as possible. The mechanism described above of broadcasting a single + ARP announcement to defend the address mitigates the problem + somewhat, by helping to improve the chance that one of the two + conflicting hosts may be able to retain its address. + + All ARP packets (*replies* as well as requests) that contain a Link- + Local 'sender IP address' MUST be sent using link-layer broadcast + instead of link-layer unicast. This aids timely detection of + duplicate addresses. An example illustrating how this helps is given + in Section 4. + +2.6. Address Usage and Forwarding Rules + + A host implementing this specification has additional rules to + conform to, whether or not it has an interface configured with an + IPv4 Link-Local address. + +2.6.1. Source Address Usage + + Since each interface on a host may have an IPv4 Link-Local address in + addition to zero or more other addresses configured by other means + (e.g., manually or via a DHCP server), a host may have to make a + + + + +Cheshire, et al. Standards Track [Page 14] + +RFC 3927 IPv4 Link-Local May 2005 + + + choice about what source address to use when it sends a packet or + initiates a TCP connection. 
+ + Where both an IPv4 Link-Local and a routable address are available on + the same interface, the routable address should be preferred as the + source address for new communications, but packets sent from or to + the IPv4 Link-Local address are still delivered as expected. The + IPv4 Link-Local address may continue to be used as a source address + in communications where switching to a preferred address would cause + communications failure because of the requirements of an upper-layer + protocol (e.g., an existing TCP connection). For more details, see + Section 1.7. + + A multi-homed host needs to select an outgoing interface whether or + not the destination is an IPv4 Link-Local address. Details of that + process are beyond the scope of this specification. After selecting + an interface, the multi-homed host should send packets involving IPv4 + Link-Local addresses as specified in this document, as if the + selected interface were the host's only interface. See Section 3 for + further discussion of multi-homed hosts. + +2.6.2. Forwarding Rules + + Whichever interface is used, if the destination address is in the + 169.254/16 prefix (excluding the address 169.254.255.255, which is + the broadcast address for the Link-Local prefix), then the sender + MUST ARP for the destination address and then send its packet + directly to the destination on the same physical link. This MUST be + done whether the interface is configured with a Link-Local or a + routable IPv4 address. + + In many network stacks, achieving this functionality may be as simple + as adding a routing table entry indicating that 169.254/16 is + directly reachable on the local link. This approach will not work + for routers or multi-homed hosts. Refer to section 3 for more + discussion of multi-homed hosts. + + The host MUST NOT send a packet with an IPv4 Link-Local destination + address to any router for forwarding. 
+ + If the destination address is a unicast address outside the + 169.254/16 prefix, then the host SHOULD use an appropriate routable + IPv4 source address, if it can. If for any reason the host chooses + to send the packet with an IPv4 Link-Local source address (e.g., no + routable address is available on the selected interface), then it + MUST ARP for the destination address and then send its packet, with + + + + + +Cheshire, et al. Standards Track [Page 15] + +RFC 3927 IPv4 Link-Local May 2005 + + + an IPv4 Link-Local source address and a routable destination IPv4 + address, directly to its destination on the same physical link. The + host MUST NOT send the packet to any router for forwarding. + + In the case of a device with a single interface and only a Link- + Local IPv4 address, this requirement can be paraphrased as "ARP for + everything". + + In many network stacks, achieving this "ARP for everything" behavior + may be as simple as having no primary IP router configured, having + the primary IP router address configured to 0.0.0.0, or having the + primary IP router address set to be the same as the host's own Link- + Local IPv4 address. For suggested behavior in multi-homed hosts, see + Section 3. + +2.7. Link-Local Packets Are Not Forwarded + + A sensible default for applications which are sending from an IPv4 + Link-Local address is to explicitly set the IPv4 TTL to 1. This is + not appropriate in all cases as some applications may require that + the IPv4 TTL be set to other values. + + An IPv4 packet whose source and/or destination address is in the + 169.254/16 prefix MUST NOT be sent to any router for forwarding, and + any network device receiving such a packet MUST NOT forward it, + regardless of the TTL in the IPv4 header. Similarly, a router or + other host MUST NOT indiscriminately answer all ARP Requests for + addresses in the 169.254/16 prefix.
A router may of course answer + ARP Requests for one or more IPv4 Link-Local address(es) that it has + legitimately claimed for its own use according to the claim-and- + defend protocol described in this document. + + This restriction also applies to multicast packets. IPv4 packets + with a Link-Local source address MUST NOT be forwarded outside the + local link even if they have a multicast destination address. + +2.8. Link-Local Packets are Local + + The non-forwarding rule means that hosts may assume that all + 169.254/16 destination addresses are "on-link" and directly + reachable. The 169.254/16 address prefix MUST NOT be subnetted. + This specification utilizes ARP-based address conflict detection, + which functions by broadcasting on the local subnet. Since such + broadcasts are not forwarded, were subnetting to be allowed then + address conflicts could remain undetected. + + + + + + +Cheshire, et al. Standards Track [Page 16] + +RFC 3927 IPv4 Link-Local May 2005 + + + This does not mean that Link-Local devices are forbidden from any + communication outside the local link. IP hosts that implement both + Link-Local and conventional routable IPv4 addresses may still use + their routable addresses without restriction as they do today. + +2.9. Higher-Layer Protocol Considerations + + Similar considerations apply at layers above IP. + + For example, designers of Web pages (including automatically + generated web pages) SHOULD NOT contain links with embedded IPv4 + Link-Local addresses if those pages are viewable from hosts outside + the local link where the addresses are valid. + + As IPv4 Link-Local addresses may change at any time and have limited + scope, IPv4 Link-Local addresses MUST NOT be stored in the DNS. + +2.10. Privacy Concerns + + Another reason to restrict leakage of IPv4 Link-Local addresses + outside the local link is privacy concerns. 
If IPv4 Link-Local + addresses are derived from a hash of the MAC address, some argue that + they could be indirectly associated with an individual, and thereby + used to track that individual's activities. Within the local link + the hardware addresses in the packets are all directly observable, so + as long as IPv4 Link-Local addresses don't leave the local link they + provide no more information to an intruder than could be gained by + direct observation of hardware addresses. + +2.11. Interaction between DHCPv4 client and IPv4 Link-Local State + Machines + + As documented in Appendix A, early implementations of IPv4 Link-Local + have modified the DHCP state machine. Field experience shows that + these modifications reduce the reliability of the DHCP service. + + A device that implements both IPv4 Link-Local and a DHCPv4 client + should not alter the behavior of the DHCPv4 client to accommodate + IPv4 Link-Local configuration. In particular configuration of an + IPv4 Link-Local address, whether or not a DHCP server is currently + responding, is not sufficient reason to unconfigure a valid DHCP + lease, to stop the DHCP client from attempting to acquire a new IP + address, to change DHCP timeouts or to change the behavior of the + DHCP state machine in any other way. + + Further discussion of this issue is provided in "Detection of Network + Attachment (DNA) in IPv4" [DNAv4]. + + + + +Cheshire, et al. Standards Track [Page 17] + +RFC 3927 IPv4 Link-Local May 2005 + + +3. Considerations for Multiple Interfaces + + The considerations outlined here also apply whenever a host has + multiple IP addresses, whether or not it has multiple physical + interfaces. Other examples of multiple interfaces include different + logical endpoints (tunnels, virtual private networks etc.) and + multiple logical networks on the same physical medium. This is often + referred to as "multi-homing". 
+ + Hosts which have more than one active interface and elect to + implement dynamic configuration of IPv4 Link-Local addresses on one + or more of those interfaces will face various problems. This section + lists these problems but does no more than indicate how one might + solve them. At the time of this writing, there is no silver bullet + which solves these problems in all cases, in a general way. + Implementors must think through these issues before implementing the + protocol specified in this document on a system which may have more + than one active interface as part of a TCP/IP stack capable of + multi-homing. + +3.1. Scoped Addresses + + A host may be attached to more than one network at the same time. It + would be nice if there was a single address space used in every + network, but this is not the case. Addresses used in one network, be + it a network behind a NAT or a link on which IPv4 Link-Local + addresses are used, cannot be used in another network and have the + same effect. + + It would also be nice if addresses were not exposed to applications, + but they are. Most software using TCP/IP that awaits messages + receives them from any interface at a particular port number, for a + particular transport protocol. Applications are generally only aware + (and care) that they have received a message. The application knows + the address of the sender to which the application will reply. + + The first scoped address problem is source address selection. A + multi-homed host has more than one address. Which address should be + used as the source address when sending to a particular destination? + This question is usually answered by referring to a routing table, + which expresses on which interface (with which address) to send, and + how to send (should one forward to a router, or send directly). The + choice is made complicated by scoped addresses because the address + range in which the destination lies may be ambiguous.
The table may + not be able to yield a good answer. This problem is bound up with + next-hop selection, which is discussed in Section 3.2. + + + + + +Cheshire, et al. Standards Track [Page 18] + +RFC 3927 IPv4 Link-Local May 2005 + + + The second scoped address problem arises from scoped parameters + leaking outside their scope. This is discussed in Section 7. + + It is possible to overcome these problems. One way is to expose + scope information to applications such that they are always aware of + what scope a peer is in. This way, the correct interface could be + selected, and a safe procedure could be followed with respect to + forwarding addresses and other scoped parameters. There are other + possible approaches. None of these methods have been standardized + for IPv4 nor are they specified in this document. A good API design + could mitigate the problems, either by exposing address scopes to + 'scoped-address aware' applications or by cleverly encapsulating the + scoping information and logic so that applications do the right thing + without being aware of address scoping. + + An implementer could undertake to solve these problems, but cannot + simply ignore them. With sufficient experience, it is hoped that + specifications will emerge explaining how to overcome scoped address + multi-homing problems. + +3.2. Address Ambiguity + + This is a core problem with respect to IPv4 Link-Local destination + addresses being reachable on more than one interface. What should a + host do when it needs to send to Link-Local destination L and L can + be resolved using ARP on more than one link? + + Even if a Link-Local address can be resolved on only one link at a + given moment, there is no guarantee that it will remain unambiguous + in the future. Additional hosts on other interfaces may claim the + address L as well. + + One possibility is to support this only in the case where the + application specifically expresses which interface to send from. 
+ + There is no standard or obvious solution to this problem. Existing + application software written for the IPv4 protocol suite is largely + incapable of dealing with address ambiguity. This does not preclude + an implementer from finding a solution, writing applications which + are able to use it, and providing a host which can support dynamic + configuration of IPv4 Link-Local addresses on more than one + interface. This solution will almost surely not be generally + applicable to existing software and transparent to higher layers, + however. + + Given that the IP stack must have the outbound interface associated + with a packet that needs to be sent to a Link-Local destination + address, interface selection must occur. The outbound interface + + + +Cheshire, et al. Standards Track [Page 19] + +RFC 3927 IPv4 Link-Local May 2005 + + + cannot be derived from the packet's header parameters such as source + or destination address (e.g., by using the forwarding table lookup). + Therefore, outbound interface association must be done explicitly + through other means. The specification does not stipulate those + means. + +3.3. Interaction with Hosts with Routable Addresses + + Attention is paid in this specification to transition from the use of + IPv4 Link-Local addresses to routable addresses (see Section 1.5). + The intention is to allow a host with a single interface to first + support Link-Local configuration then gracefully transition to the + use of a routable address. Since the host transitioning to the use + of a routable address may temporarily have more than one address + active, the scoped address issues described in Section 3.1 will + apply. 
When a host acquires a routable address, it does not need to + retain its Link-Local address for the purpose of communicating with + other devices on the link that are themselves using only Link-Local + addresses: any host conforming to this specification knows that + regardless of source address an IPv4 Link-Local destination must be + reached by forwarding directly to the destination, not via a router; + it is not necessary for that host to have a Link-Local source address + in order to send to a Link-Local destination address. + + A host with an IPv4 Link-Local address may send to a destination + which does not have an IPv4 Link-Local address. If the host is not + multi-homed, the procedure is simple and unambiguous: Using ARP and + forwarding directly to on-link destinations is the default route. If + the host is multi-homed, however, the routing policy is more complex, + especially if one of the interfaces is configured with a routable + address and the default route is (sensibly) directed at a router + accessible through that interface. The following example illustrates + this problem and provides a common solution to it. + + i1 +---------+ i2 i3 +-------+ + ROUTER-------= HOST1 =---------= HOST2 | + link1 +---------+ link2 +-------+ + + In the figure above, HOST1 is connected to link1 and link2. + Interface i1 is configured with a routable address, while i2 is an + IPv4 Link-Local address. HOST1 has its default route set to ROUTER's + address, through i1. HOST1 will route to destinations in 169.254/16 + to i2, sending directly to the destination. + + HOST2 has a configured (non-Link-Local) IPv4 address assigned to i3. + + + + + + +Cheshire, et al. Standards Track [Page 20] + +RFC 3927 IPv4 Link-Local May 2005 + + + Using a name resolution or service discovery protocol HOST1 can + discover HOST2's address. Since HOST2's address is not in + 169.254/16, HOST1's routing policy will send datagrams to HOST2 via + i1, to the ROUTER. 
Unless there is a route from ROUTER to HOST2, the + datagrams sent from HOST1 to HOST2 will not reach it. + + One solution to this problem is for a host to attempt to reach any + host locally (using ARP) for which it receives an unreachable ICMP + error message (ICMP message codes 0, 1, 6 or 7 [RFC792]). The host + tries all its attached links in a round robin fashion. This has been + implemented successfully for some IPv6 hosts, to circumvent exactly + this problem. In terms of this example, HOST1 upon failing to reach + HOST2 via the ROUTER, will attempt to forward to HOST2 via i2 and + succeed. + + It may also be possible to overcome this problem using techniques + described in section 3.2, or other means not discussed here. This + specification does not provide a standard solution, nor does it + preclude implementers from supporting multi-homed configurations, + provided that they address the concerns in this section for the + applications which will be supported on the host. + +3.4. Unintentional Autoimmune Response + + Care must be taken if a multi-homed host can support more than one + interface on the same link, all of which support IPv4 Link-Local + autoconfiguration. If these interfaces attempt to allocate the same + address, they will defend the host against itself -- causing the + claiming algorithm to fail. The simplest solution to this problem is + to run the algorithm independently on each interface configured with + IPv4 Link-Local addresses. + + In particular, ARP packets which appear to claim an address which is + assigned to a specific interface, indicate conflict only if they are + received on that interface and their hardware address is of some + other interface. + + If a host has two interfaces on the same link, then claiming and + defending on those interfaces must ensure that they end up with + different addresses just as if they were on different hosts. 
Note + that some of the ways a host may find itself with two interfaces on + the same link may be unexpected and non-obvious, such as when a host + has Ethernet and 802.11 wireless, but those two links are (possibly + even without the knowledge of the host's user) bridged together. + + + + + + + +Cheshire, et al. Standards Track [Page 21] + +RFC 3927 IPv4 Link-Local May 2005 + + +4. Healing of Network Partitions + + Hosts on disjoint network links may configure the same IPv4 Link- + Local address. If these separate network links are later joined or + bridged together, then there may be two hosts which are now on the + same link, trying to use the same address. When either host attempts + to communicate with any other host on the network, it will at some + point broadcast an ARP packet which will enable the hosts in question + to detect that there is an address conflict. + + When these address conflicts are detected, the subsequent forced + reconfiguration may be disruptive, causing TCP connections to be + broken. However, it is expected that such disruptions will be rare. + It should be relatively uncommon for networks to be joined while + hosts on those networks are active. Also, 65024 addresses are + available for IPv4 Link-Local use, so even when two small networks + are joined, the chance of conflict for any given host is fairly + small. + + When joining two large networks (defined as networks with a + substantial number of hosts per segment) there is a greater chance of + conflict. In such networks, it is likely that the joining of + previously separated segments will result in one or more hosts + needing to change their IPv4 Link-Local address, with subsequent loss + of TCP connections. In cases where separation and re-joining is + frequent, as in remotely bridged networks, this could prove + disruptive. 
However, unless the number of hosts on the joined + segments is very large, the traffic resulting from the join and + subsequent address conflict resolution will be small. + + Sending ARP replies that have IPv4 Link-Local sender addresses via + broadcast instead of unicast ensures that these conflicts can be + detected as soon as they become potential problems, but no sooner. + For example, if two disjoint network links are joined, where hosts A + and B have both configured the same Link-Local address, X, they can + remain in this state until A, B or some other host attempts to + initiate communication. If some other host C now sends an ARP + request for address X, and hosts A and B were to both reply with + conventional unicast ARP replies, then host C might be confused, but + A and B still wouldn't know there is a problem because neither would + have seen the other's packet. Sending these replies via broadcast + allows A and B to see each other's conflicting ARP packets and + respond accordingly. + + Note that sending periodic gratuitous ARPs in an attempt to detect + these conflicts sooner is not necessary, wastes network bandwidth, + and may actually be detrimental. For example, if the network links + were joined only briefly, and were separated again before any new + + + +Cheshire, et al. Standards Track [Page 22] + +RFC 3927 IPv4 Link-Local May 2005 + + + communication involving A or B were initiated, then the temporary + conflict would have been benign and no forced reconfiguration would + have been required. Triggering an unnecessary forced reconfiguration + in this case would not serve any useful purpose. Hosts SHOULD NOT + send periodic gratuitous ARPs. + +5. Security Considerations + + The use of IPv4 Link-Local Addresses may open a network host to new + attacks. In particular, a host that previously did not have an IP + address, and no IP stack running, was not susceptible to IP-based + attacks. 
By configuring a working address, the host may now be + vulnerable to IP-based attacks. + + The ARP protocol [RFC826] is insecure. A malicious host may send + fraudulent ARP packets on the network, interfering with the correct + operation of other hosts. For example, it is easy for a host to + answer all ARP requests with replies giving its own hardware address, + thereby claiming ownership of every address on the network. + + NOTE: There are certain kinds of local links, such as wireless LANs, + that provide no physical security. Because of the existence of these + links it would be very unwise for an implementer to assume that when + a device is communicating only on the local link it can dispense with + normal security precautions. Failure to implement appropriate + security measures could expose users to considerable risks. + + A host implementing IPv4 Link-Local configuration has an additional + vulnerability to selective reconfiguration and disruption. It is + possible for an on-link attacker to issue ARP packets which would + cause a host to break all its connections by switching to a new + address. The attacker could force the host implementing IPv4 Link- + Local configuration to select certain addresses, or prevent it from + ever completing address selection. This is a distinct threat from + that posed by spoofed ARPs, described in the preceding paragraph. + + Implementations and users should also note that a node that gives up + an address and reconfigures, as required by section 2.5, allows the + possibility that another node can easily and successfully hijack + existing TCP connections. + + Implementers are advised that the Internet Protocol architecture + expects every networked device or host must implement security which + is adequate to protect the resources to which the device or host has + access, including the network itself, against known or credible + threats. 
Even though use of IPv4 Link-Local addresses may reduce the + + + + + +Cheshire, et al. Standards Track [Page 23] + +RFC 3927 IPv4 Link-Local May 2005 + + + number of threats to which a device is exposed, implementers of + devices supporting the Internet Protocol must not assume that a + customer's local network is free from security risks. + + While there may be particular kinds of devices, or particular + environments, for which the security provided by the network is + adequate to protect the resources that are accessible by the device, + it would be misleading to make a general statement to the effect that + the requirement to provide security is reduced for devices using IPv4 + Link-Local addresses as a sole means of access. + + In all cases, whether or not IPv4 Link-Local addresses are used, it + is necessary for implementers of devices supporting the Internet + Protocol to analyze the known and credible threats to which a + specific host or device might be subjected, and to the extent that it + is feasible, to provide security mechanisms which ameliorate or + reduce the risks associated with such threats. + +6. Application Programming Considerations + + Use of IPv4 Link-Local autoconfigured addresses presents additional + challenges to writers of applications and may result in existing + application software failing. + +6.1. Address Changes, Failure and Recovery + + IPv4 Link-Local addresses used by an application may change over + time. Some application software encountering an address change will + fail. For example, existing client TCP connections will be aborted, + servers whose addresses change will have to be rediscovered, blocked + reads and writes will exit with an error condition, and so on. + + Vendors producing application software which will be used on IP + implementations supporting IPv4 Link-Local address configuration + SHOULD detect and cope with address change events. 
Vendors producing + IPv4 implementations supporting IPv4 Link-Local address configuration + SHOULD expose address change events to applications. + +6.2. Limited Forwarding of Locators + + IPv4 Link-Local addresses MUST NOT be forwarded via an application + protocol (for example in a URL), to a destination that is not on the + same link. This is discussed further in Sections 2.9 and 3. + + Existing distributed application software that forwards address + information may fail. For example, FTP [RFC959] (when not using + passive mode) transmits the IP address of the client. Suppose a + client starts up and obtains its IPv4 configuration at a time when it + + + +Cheshire, et al. Standards Track [Page 24] + +RFC 3927 IPv4 Link-Local May 2005 + + + has only a Link-Local address. Later, the host gets a global IP + address, and the client contacts an FTP server outside the local + link. If the FTP client transmits its old Link-Local address instead + of its new global IP address in the FTP "port" command, then the FTP + server will be unable to open a data connection back to the client, + and the FTP operation will fail. + +6.3. Address Ambiguity + + Application software run on a multi-homed host that supports IPv4 + Link-Local address configuration on more than one interface may fail. + + This is because application software assumes that an IPv4 address is + unambiguous, that it can refer to only one host. IPv4 Link-Local + addresses are unique only on a single link. A host attached to + multiple links can easily encounter a situation where the same + address is present on more than one interface, or first on one + interface, later on another; in any case associated with more than + one host. Most existing software is not prepared for this ambiguity. + In the future, application programming interfaces could be developed + to prevent this problem. This issue is discussed in Section 3. + +7. 
Router Considerations + + A router MUST NOT forward a packet with an IPv4 Link-Local source or + destination address, irrespective of the router's default route + configuration or routes obtained from dynamic routing protocols. + + A router which receives a packet with an IPv4 Link-Local source or + destination address MUST NOT forward the packet. This prevents + forwarding of packets back onto the network segment from which they + originated, or to any other segment. + +8. IANA Considerations + + The IANA has allocated the prefix 169.254/16 for the use described in + this document. The first and last 256 addresses in this range + (169.254.0.x and 169.254.255.x) are allocated by Standards Action, as + defined in "Guidelines for Writing an IANA Considerations Section in + RFCs" (BCP 26) [RFC2434]. No other IANA services are required by + this document. + + + + + + + + + + + +Cheshire, et al. Standards Track [Page 25] + +RFC 3927 IPv4 Link-Local May 2005 + + +9. Constants + + The following timing constants are used in this protocol; they are + not intended to be user configurable. + + PROBE_WAIT 1 second (initial random delay) + PROBE_NUM 3 (number of probe packets) + PROBE_MIN 1 second (minimum delay till repeated probe) + PROBE_MAX 2 seconds (maximum delay till repeated probe) + ANNOUNCE_WAIT 2 seconds (delay before announcing) + ANNOUNCE_NUM 2 (number of announcement packets) + ANNOUNCE_INTERVAL 2 seconds (time between announcement packets) + MAX_CONFLICTS 10 (max conflicts before rate limiting) + RATE_LIMIT_INTERVAL 60 seconds (delay between successive attempts) + DEFEND_INTERVAL 10 seconds (minimum interval between defensive + ARPs). + +10. References + +10.1. Normative References + + [RFC792] Postel, J., "Internet Control Message Protocol", STD 5, RFC + 792, September 1981. + + [RFC826] Plummer, D., "Ethernet Address Resolution Protocol: Or + converting network protocol addresses to 48.bit Ethernet + address for transmission on Ethernet hardware", STD 37, RFC + 826, November 1982.
+ + [RFC2119] Bradner, S., "Key words for use in RFCs to Indicate + Requirement Levels", BCP 14, RFC 2119, March 1997. + + [RFC2434] Narten, T. and H. Alvestrand, "Guidelines for Writing an + IANA Considerations Section in RFCs", BCP 26, RFC 2434, + October 1998. + +10.2. Informative References + + [802] IEEE Standards for Local and Metropolitan Area Networks: + Overview and Architecture, ANSI/IEEE Std 802, 1990. + + [802.3] ISO/IEC 8802-3 Information technology - Telecommunications + and information exchange between systems - Local and + metropolitan area networks - Common specifications - Part + 3: Carrier Sense Multiple Access with Collision Detection + (CSMA/CD) Access Method and Physical Layer Specifications, + (also ANSI/IEEE Std 802.3- 1996), 1996. + + + + +Cheshire, et al. Standards Track [Page 26] + +RFC 3927 IPv4 Link-Local May 2005 + + + [802.5] ISO/IEC 8802-5 Information technology - Telecommunications + and information exchange between systems - Local and + metropolitan area networks - Common specifications - Part + 5: Token ring access method and physical layer + specifications, (also ANSI/IEEE Std 802.5-1998), 1998. + + [802.11] Information technology - Telecommunications and information + exchange between systems - Local and metropolitan area + networks - Specific Requirements Part 11: Wireless LAN + Medium Access Control (MAC) and Physical Layer (PHY) + Specifications, IEEE Std. 802.11-1999, 1999. + + [RFC959] Postel, J. and J. Reynolds, "File Transfer Protocol", STD + 9, RFC 959, October 1985. + + [RFC1918] Rekhter, Y., Moskowitz, B., Karrenberg, D., de Groot, G., + and E. Lear, "Address Allocation for Private Internets", + BCP 5, RFC 1918, February 1996. + + [RFC2131] Droms, R., "Dynamic Host Configuration Protocol", RFC 2131, + March 1997. + + [RFC2462] Thomson, S. and T. Narten, "IPv6 Stateless Address + Autoconfiguration", RFC 2462, December 1998. + + [RFC3027] Holdrege, M. and P. 
Srisuresh, "Protocol Complications with + the IP Network Address Translator", RFC 3027, January 2001. + + [DNAv4] Aboba, B., "Detection of Network Attachment (DNA) in IPv4", + Work in Progress, July 2004. + + [LLMNR] Esibov, L., Aboba, B. and D. Thaler, "Linklocal Multicast + Name Resolution (LLMNR)", Work in Progress, June 2004. + +Acknowledgments + + We would like to thank (in alphabetical order) Jim Busse, Pavani + Diwanji, Donald Eastlake 3rd, Robert Elz, Peter Ford, Spencer + Giacalone, Josh Graessley, Brad Hards, Myron Hattig, Hugh Holbrook, + Christian Huitema, Richard Johnson, Kim Yong-Woon, Mika Liljeberg, + Rod Lopez, Keith Moore, Satish Mundra, Thomas Narten, Erik Nordmark, + Philip Nye, Howard Ridenour, Daniel Senie, Dieter Siegmund, Valery + Smyslov, and Ryan Troll for their contributions. + + + + + + + + +Cheshire, et al. Standards Track [Page 27] + +RFC 3927 IPv4 Link-Local May 2005 + + +Appendix A - Prior Implementations + +A.1. Apple Mac OS 8.x and 9.x. + + Mac OS chooses the IP address on a pseudo-random basis. The selected + address is saved in persistent storage for continued use after + reboot, when possible. + + Mac OS sends nine DHCPDISCOVER packets, with an interval of two + seconds between packets. If no response is received from any of + these requests (18 seconds), it will autoconfigure. + + Upon finding that a selected address is in use, Mac OS will select a + new random address and try again, at a rate limited to no more than + one attempt every two seconds. + + Autoconfigured Mac OS systems check for the presence of a DHCP server + every five minutes. If a DHCP server is found but Mac OS is not + successful in obtaining a new lease, it keeps the existing + autoconfigured IP address. If Mac OS is successful at obtaining a + new lease, it drops all existing connections without warning. This + may cause users to lose sessions in progress. 
Once a new lease is + obtained, Mac OS will not allocate further connections using the + autoconfigured IP address. + + Mac OS systems do not send packets addressed to a Link-Local address + to the default gateway if one is present; these addresses are always + resolved on the local segment. + + Mac OS systems by default send all outgoing unicast packets with a + TTL of 255. All multicast and broadcast packets are also sent with a + TTL of 255 if they have a source address in the 169.254/16 prefix. + + Mac OS implements media sense where the hardware (and driver + software) supports this. As soon as network connectivity is + detected, a DHCPDISCOVER will be sent on the interface. This means + that systems will immediately transition out of autoconfigured mode + as soon as connectivity is restored. + +A.2. Apple Mac OS X Version 10.2 + + Mac OS X chooses the IP address on a pseudo-random basis. The + selected address is saved in memory so that it can be re-used during + subsequent autoconfiguration attempts during a single boot of the + system. + + + + + + +Cheshire, et al. Standards Track [Page 28] + +RFC 3927 IPv4 Link-Local May 2005 + + + Autoconfiguration of a Link-Local address depends on the results of + the DHCP process. DHCP sends two packets, with timeouts of one and + two seconds. If no response is received (three seconds), it begins + autoconfiguration. DHCP continues sending packets in parallel for a + total time of 60 seconds. + + At the start of autoconfiguration, it generates 10 unique random IP + addresses, and probes each one in turn for 2 seconds. It stops + probing after finding an address that is not in use, or the list of + addresses is exhausted. + + If DHCP is not successful, it waits five minutes before starting over + again. Once DHCP is successful, the autoconfigured Link-Local + address is given up. The Link-Local subnet, however, remains + configured. 
+ + Autoconfiguration is only attempted on a single interface at any + given moment in time. + + Mac OS X ensures that the connected interface with the highest + priority is associated with the Link-Local subnet. Packets addressed + to a Link-Local address are never sent to the default gateway, if one + is present. Link-local addresses are always resolved on the local + segment. + + Mac OS X implements media sense where the hardware and driver support + it. When the network media indicates that it has been connected, the + autoconfiguration process begins again, and attempts to re-use the + previously assigned Link-Local address. When the network media + indicates that it has been disconnected, the system waits four + seconds before de-configuring the Link-Local address and subnet. If + the connection is restored before that time, the autoconfiguration + process begins again. If the connection is not restored before that + time, the system chooses another interface to autoconfigure. + + Mac OS X by default sends all outgoing unicast packets with a TTL of + 255. All multicast and broadcast packets are also sent with a TTL of + 255 if they have a source address in the 169.254/16 prefix. + +A.3. Microsoft Windows 98/98SE + + Windows 98/98SE systems choose their IPv4 Link-Local address on a + pseudo-random basis. The address selection algorithm is based on + computing a hash on the interface's MAC address, so that a large + collection of hosts should obey the uniform probability distribution + in choosing addresses within the 169.254/16 address space. Deriving + + + + + +Cheshire, et al. Standards Track [Page 29] + +RFC 3927 IPv4 Link-Local May 2005 + + + the initial IPv4 Link-Local address from the interface's MAC address + also ensures that systems rebooting will obtain the same + autoconfigured address, unless a conflict is detected. 
+ + When in INIT state, the Windows 98/98SE DHCP Client sends out a total + of 4 DHCPDISCOVERs, with an inter-packet interval of 6 seconds. When + no response is received after all 4 packets (24 seconds), it will + autoconfigure an address. + + The autoconfigure retry count for Windows 98/98SE systems is 10. + After trying 10 autoconfigured IPv4 addresses, and finding all are + taken, the host will boot without an IPv4 address. + + Autoconfigured Windows 98/98SE systems check for the presence of a + DHCP server every five minutes. If a DHCP server is found but + Windows 98 is not successful in obtaining a new lease, it keeps the + existing autoconfigured IPv4 Link-Local address. If Windows 98/98SE + is successful at obtaining a new lease, it drops all existing + connections without warning. This may cause users to lose sessions + in progress. Once a new lease is obtained, Windows 98/98SE will not + allocate further connections using the autoconfigured IPv4 Link-Local + address. + + Windows 98/98SE systems with an IPv4 Link-Local address do not send + packets addressed to an IPv4 Link-Local address to the default + gateway if one is present; these addresses are always resolved on the + local segment. + + Windows 98/98SE systems by default send all outgoing unicast packets + with a TTL of 128. TTL configuration is performed by setting the + Windows Registry Key + HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Services:\Tcpip\ + Parameters\DefaultTTL of type REG_DWORD to the appropriate value. + However, this default TTL will apply to all packets. While this + facility could be used to set the default TTL to 255, it cannot be + used to set the default TTL of IPv4 Link-Local packets to one (1), + while allowing other packets to be sent with a TTL larger than one. + + Windows 98/98SE systems do not implement media sense. 
This means + that network connectivity issues (such as a loose cable) may prevent + a system from contacting the DHCP server, thereby causing it to + auto-configure. When the connectivity problem is fixed (such as when + the cable is re-connected) the situation will not immediately correct + itself. Since the system will not sense the re-connection, it will + remain in autoconfigured mode until an attempt is made to reach the + DHCP server. + + + + + +Cheshire, et al. Standards Track [Page 30] + +RFC 3927 IPv4 Link-Local May 2005 + + + The DHCP server included with Windows 98SE Internet Connection + Sharing (ICS) (a NAT implementation) allocates out of the 192.168/16 + private address space by default. + + However, it is possible to change the allocation prefix via a + registry key, and no checks are made to prevent allocation out of the + IPv4 Link-Local prefix. When configured to do so, Windows 98SE ICS + will rewrite packets from the IPv4 Link-Local prefix and forward them + beyond the local link. Windows 98SE ICS does not automatically route + for the IPv4 Link-Local prefix, so that hosts obtaining addresses via + DHCP cannot communicate with autoconfigured-only devices. + + Other home gateways exist that allocate addresses out of the IPv4 + Link-Local prefix by default. Windows 98/98SE systems can use a + 169.254/16 IPv4 Link-Local address as the source address when + communicating with non-Link-Local hosts. Windows 98/98SE does not + support router solicitation/advertisement. Windows 98/98SE systems + will not automatically discover a default gateway when in + autoconfigured mode. + +A.4. Windows XP, 2000, and ME + + The autoconfiguration behavior of Windows XP, Windows 2000, and + Windows ME systems is identical to Windows 98/98SE except in the + following respects: + + Media Sense + Router Discovery + Silent RIP + + Windows XP, 2000, and ME implement media sense. 
As soon as network + connectivity is detected, a DHCPREQUEST or DHCPDISCOVER will be sent + on the interface. This means that systems will immediately + transition out of autoconfigured mode as soon as connectivity is + restored. + + Windows XP, 2000, and ME also support router discovery, although it + is turned off by default. Windows XP and 2000 also support a RIP + listener. This means that they may inadvertently discover a default + gateway while in autoconfigured mode. + + ICS on Windows XP/2000/ME behaves identically to Windows 98SE with + respect to address allocation and NATing of Link-Local prefixes. + + + + + + + + +Cheshire, et al. Standards Track [Page 31] + +RFC 3927 IPv4 Link-Local May 2005 + + +Authors' Addresses + + Stuart Cheshire + Apple Computer, Inc. + 1 Infinite Loop + Cupertino + California 95014, USA + + Phone: +1 408 974 3207 + EMail: rfc@stuartcheshire.org + + + Bernard Aboba + Microsoft Corporation + One Microsoft Way + Redmond, WA 98052 + + Phone: +1 425 818 4011 + EMail: bernarda@microsoft.com + + + Erik Guttman + Sun Microsystems + Eichhoelzelstr. 7 + 74915 Waibstadt Germany + + Phone: +49 7263 911 701 + EMail: erik@spybeam.org + + + + + + + + + + + + + + + + + + + + + + + +Cheshire, et al. Standards Track [Page 32] + +RFC 3927 IPv4 Link-Local May 2005 + + +Full Copyright Statement + + Copyright (C) The Internet Society (2005). + + This document is subject to the rights, licenses and restrictions + contained in BCP 78, and except as set forth therein, the authors + retain all their rights. 
+ + This document and the information contained herein are provided on an + "AS IS" basis and THE CONTRIBUTOR, THE ORGANIZATION HE/SHE REPRESENTS + OR IS SPONSORED BY (IF ANY), THE INTERNET SOCIETY AND THE INTERNET + ENGINEERING TASK FORCE DISCLAIM ALL WARRANTIES, EXPRESS OR IMPLIED, + INCLUDING BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE + INFORMATION HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED + WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + +Intellectual Property + + The IETF takes no position regarding the validity or scope of any + Intellectual Property Rights or other rights that might be claimed to + pertain to the implementation or use of the technology described in + this document or the extent to which any license under such rights + might or might not be available; nor does it represent that it has + made any independent effort to identify any such rights. Information + on the procedures with respect to rights in RFC documents can be + found in BCP 78 and BCP 79. + + Copies of IPR disclosures made to the IETF Secretariat and any + assurances of licenses to be made available, or the result of an + attempt made to obtain a general license or permission for the use of + such proprietary rights by implementers or users of this + specification can be obtained from the IETF on-line IPR repository at + http://www.ietf.org/ipr. + + The IETF invites any interested party to bring to its attention any + copyrights, patents or patent applications, or other proprietary + rights that may cover technology that may be required to implement + this standard. Please address the information to the IETF at ietf- + ipr@ietf.org. + +Acknowledgement + + Funding for the RFC Editor function is currently provided by the + Internet Society. + + + + + + + +Cheshire, et al. 
Standards Track [Page 33] + diff --git a/ext/picotcp/RFC/rfc4614.txt b/ext/picotcp/RFC/rfc4614.txt new file mode 100644 index 0000000..80d08af --- /dev/null +++ b/ext/picotcp/RFC/rfc4614.txt @@ -0,0 +1,1851 @@ + + + + + + +Network Working Group M. Duke +Request for Comments: 4614 Boeing Phantom Works +Category: Informational R. Braden + USC Information Sciences Institute + W. Eddy + Verizon Federal Network Systems + E. Blanton + Purdue University Computer Science + September 2006 + + + A Roadmap for Transmission Control Protocol (TCP) + Specification Documents + +Status of This Memo + + This memo provides information for the Internet community. It does + not specify an Internet standard of any kind. Distribution of this + memo is unlimited. + +Copyright Notice + + Copyright (C) The Internet Society (2006). + +Abstract + + This document contains a "roadmap" to the Requests for Comments (RFC) + documents relating to the Internet's Transmission Control Protocol + (TCP). This roadmap provides a brief summary of the documents + defining TCP and various TCP extensions that have accumulated in the + RFC series. This serves as a guide and quick reference for both TCP + implementers and other parties who desire information contained in + the TCP-related RFCs. + + + + + + + + + + + + + + + + + + +Duke, et al. Informational [Page 1] + +RFC 4614 TCP Roadmap September 2006 + + +Table of Contents + + 1. Introduction ....................................................2 + 2. Basic Functionality .............................................4 + 3. Recommended Enhancements ........................................6 + 3.1. Congestion Control and Loss Recovery Extensions ............7 + 3.2. SACK-Based Loss Recovery and Congestion Control ............8 + 3.3. Dealing with Forged Segments ...............................9 + 4. Experimental Extensions ........................................10 + 5. Historic Extensions ............................................13 + 6. 
Support Documents ..............................................14 + 6.1. Foundational Works ........................................15 + 6.2. Difficult Network Environments ............................16 + 6.3. Implementation Advice .....................................19 + 6.4. Management Information Bases ..............................20 + 6.5. Tools and Tutorials .......................................22 + 6.6. Case Studies ..............................................22 + 7. Undocumented TCP Features ......................................23 + 8. Security Considerations ........................................24 + 9. Acknowledgments ................................................24 + 10. Informative References ........................................25 + 10.1. Basic Functionality ......................................25 + 10.2. Recommended Enhancements .................................25 + 10.3. Experimental Extensions ..................................26 + 10.4. Historic Extensions ......................................27 + 10.5. Support Documents ........................................28 + 10.6. Informative References Outside the RFC Series ............31 + +1. Introduction + + A correct and efficient implementation of the Transmission Control + Protocol (TCP) is a critical part of the software of most Internet + hosts. As TCP has evolved over the years, many distinct documents + have become part of the accepted standard for TCP. At the same time, + a large number of more experimental modifications to TCP have also + been published in the RFC series, along with informational notes, + case studies, and other advice. + + As an introduction to newcomers and an attempt to organize the + plethora of information for old hands, this document contains a + "roadmap" to the TCP-related RFCs. It provides a brief summary of + the RFC documents that define TCP. 
This should provide guidance to + implementers on the relevance and significance of the standards-track + extensions, informational notes, and best current practices that + relate to TCP. + + + + + + +Duke, et al. Informational [Page 2] + +RFC 4614 TCP Roadmap September 2006 + + + This document is not an update of RFC 1122 and is not a rigorous + standard for what needs to be implemented in TCP. This document is + merely an informational roadmap that captures, organizes, and + summarizes most of the RFC documents that a TCP implementer, + experimenter, or student should be aware of. Particular comments or + broad categorizations that this document makes about individual + mechanisms and behaviors are not to be taken as definitive, nor + should the content of this document alone influence implementation + decisions. + + This roadmap includes a brief description of the contents of each + TCP-related RFC. In some cases, we simply supply the abstract or a + key summary sentence from the text as a terse description. In + addition, a letter code after an RFC number indicates its category in + the RFC series (see BCP 9 [RFC2026] for explanation of these + categories): + + S - Standards Track (Proposed Standard, Draft Standard, or + Standard) + + E - Experimental + + B - Best Current Practice + + I - Informational + + Note that the category of an RFC does not necessarily reflect its + current relevance. For instance, RFC 2581 is nearly universally + deployed although it is only a Proposed Standard. Similarly, some + Informational RFCs contain significant technical proposals for + changing TCP. + + This roadmap is divided into four main sections. Section 2 lists the + RFCs that describe absolutely required TCP behaviors for proper + functioning and interoperability. Further RFCs that describe + strongly encouraged, but non-essential, behaviors are listed in + Section 3. 
Experimental extensions that are not yet standard + practices, but that potentially could be in the future, are described + in Section 4. + + The reader will probably notice that these three sections are broadly + equivalent to MUST/SHOULD/MAY specifications (per RFC 2119), and + although the authors support this intuition, this document is merely + descriptive; it does not represent a binding standards-track + position. Individual implementers still need to examine the + standards documents themselves to evaluate specific requirement + levels. + + + + +Duke, et al. Informational [Page 3] + +RFC 4614 TCP Roadmap September 2006 + + + A small number of older experimental extensions that have not been + widely implemented, deployed, and used are noted in Section 5. Many + other supporting documents that are relevant to the development, + implementation, and deployment of TCP are described in Section 6. + Within each section, RFCs are listed in the chronological order of + their publication dates. + + A small number of fairly ubiquitous important implementation + practices that are not currently documented in the RFC series are + listed in Section 7. + +2. Basic Functionality + + A small number of documents compose the core specification of TCP. + These define the required basic functionalities of TCP's header + parsing, state machine, congestion control, and retransmission + timeout computation. These base specifications must be correctly + followed for interoperability. + + RFC 793 S: "Transmission Control Protocol", STD 7 (September 1981) + + This is the fundamental TCP specification document [RFC0793]. + Written by Jon Postel as part of the Internet protocol suite's + core, it describes the TCP packet format, the TCP state machine + and event processing, and TCP's semantics for data transmission, + reliability, flow control, multiplexing, and acknowledgment. 
+ + Section 3.6 of RFC 793, describing TCP's handling of the IP + precedence and security compartment, is mostly irrelevant today. + RFC 2873 changed the IP precedence handling, and the security + compartment portion of the API is no longer implemented or used. + In addition, RFC 793 did not describe any congestion control + mechanism. Otherwise, however, the majority of this document + still accurately describes modern TCPs. RFC 793 is the last of a + series of developmental TCP specifications, starting in the + Internet Experimental Notes (IENs) and continuing in the RFC + series. + + RFC 1122 S: "Requirements for Internet Hosts - Communication Layers" + (October 1989) + + This document [RFC1122] updates and clarifies RFC 793, fixing some + specification bugs and oversights. It also explains some features + such as keep-alives and Karn's and Jacobson's RTO estimation + algorithms [KP87][Jac88][JK92]. ICMP interactions are mentioned, + and some tips are given for efficient implementation. RFC 1122 is + an Applicability Statement, listing the various features that + MUST, SHOULD, MAY, SHOULD NOT, and MUST NOT be present in + + + +Duke, et al. Informational [Page 4] + +RFC 4614 TCP Roadmap September 2006 + + + standards-conforming TCP implementations. Unlike a purely + informational "roadmap", this Applicability Statement is a + standards document and gives formal rules for implementation. + + RFC 2460 S: "Internet Protocol, Version 6 (IPv6) Specification" + (December 1998) + + This document [RFC2460] is of relevance to TCP because it defines + how the pseudo-header for TCP's checksum computation is derived + when 128-bit IPv6 addresses are used instead of 32-bit IPv4 + addresses. Additionally, RFC 2675 describes TCP changes required + to support IPv6 jumbograms. + + RFC 2581 S: "TCP Congestion Control" (April 1999) + + Although RFC 793 did not contain any congestion control + mechanisms, today congestion control is a required component of + TCP implementations.
This document [RFC2581] defines the current + versions of Van Jacobson's congestion avoidance and control + mechanisms for TCP, based on his 1988 SIGCOMM paper [Jac88]. RFC + 2001 was a conceptual precursor that was obsoleted by RFC 2581. + + A number of behaviors that together constitute what the community + refers to as "Reno TCP" are described in RFC 2581. The name + "Reno" comes from the Net/2 release of the 4.3 BSD operating + system. This is generally regarded as the least common + denominator among TCP flavors currently found running on Internet + hosts. Reno TCP includes the congestion control features of slow + start, congestion avoidance, fast retransmit, and fast recovery. + + RFC 1122 mandates the implementation of a congestion control + mechanism, and RFC 2581 details the currently accepted mechanism. + RFC 2581 differs slightly from the other documents listed in this + section, as it does not affect the ability of two TCP endpoints to + communicate; however, congestion control remains a critical + component of any widely deployed TCP implementation and is + required for the avoidance of congestion collapse and to ensure + fairness among competing flows. + + RFC 2873 S: "TCP Processing of the IPv4 Precedence Field" (June 2000) + + This document [RFC2873] removes from the TCP specification all + processing of the precedence bits of the TOS byte of the IP + header. This resolves a conflict over the use of these bits + between RFC 793 and Differentiated Services [RFC2474]. + + + + + + +Duke, et al. Informational [Page 5] + +RFC 4614 TCP Roadmap September 2006 + + + RFC 2988 S: "Computing TCP's Retransmission Timer" (November 2000) + + Abstract: "This document defines the standard algorithm that + Transmission Control Protocol (TCP) senders are required to use to + compute and manage their retransmission timer. It expands on the + discussion in section 4.2.3.1 of RFC 1122 and upgrades the + requirement of supporting the algorithm from a SHOULD to a MUST." 
+ [RFC2988] + +3. Recommended Enhancements + + This section describes recommended TCP modifications that improve + performance and security. RFCs 1323 and 3168 represent fundamental + changes to the protocol. RFC 1323, based on RFCs 1072 and 1185, + allows better utilization of high bandwidth-delay product paths by + providing some needed mechanisms for high-rate transfers. RFC 3168 + describes a change to the Internet's architecture, whereby routers + signal end-hosts of growing congestion levels and can do so before + packet losses are forced. Section 3.1 lists improvements in the + congestion control and loss recovery mechanisms specified in RFC + 2581. Section 3.2 describes further refinements that make use of + selective acknowledgments. Section 3.3 deals with the problem of + preventing forged segments. + + RFC 1323 S: "TCP Extensions for High Performance" (May 1992) + + This document [RFC1323] defines TCP extensions for window scaling, + timestamps, and protection against wrapped sequence numbers, for + efficient and safe operation over paths with large bandwidth-delay + products. These extensions are commonly found in currently used + systems; however, they may require manual tuning and + configuration. One issue in this specification that is still + under discussion concerns a modification to the algorithm for + estimating the mean RTT when timestamps are used. + + RFC 2675 S: "IPv6 Jumbograms" (August 1999) + + IPv6 supports longer datagrams than were allowed in IPv4. These + are known as Jumbograms, and use with TCP has necessitated changes + to the handling of TCP's MSS and Urgent fields (both 16 bits). + This document [RFC2675] explains those changes. Although it + describes changes to basic header semantics, these changes should + only affect the use of very large segments, such as IPv6 + jumbograms, which are currently rarely used in the general + Internet. 
Supporting the behavior described in this document does + not affect interoperability with other TCP implementations when + IPv4 or non-jumbogram IPv6 is used. This document states that + jumbograms are to only be used when it can be guaranteed that all + + + +Duke, et al. Informational [Page 6] + +RFC 4614 TCP Roadmap September 2006 + + + receiving nodes, including each router in the end-to-end path, + will support jumbograms. If even a single node that does not + support jumbograms is attached to a local network, then no host on + that network may use jumbograms. This explains why jumbogram use + has been rare, and why this document is considered a performance + optimization and not part of TCP over IPv6's basic functionality. + + RFC 3168 S: "The Addition of Explicit Congestion Notification (ECN) + to IP" (September 2001) + + This document [RFC3168] defines a means for end hosts to detect + congestion before congested routers are forced to discard packets. + Although congestion notification takes place at the IP level, ECN + requires support at the transport level (e.g., in TCP) to echo the + bits and adapt the sending rate. This document updates RFC 793 to + define two previously unused flag bits in the TCP header for ECN + support. RFC 3540 provides a supplementary (experimental) means + for more secure use of ECN, and RFC 2884 provides some sample + results from using ECN. + +3.1. Congestion Control and Loss Recovery Extensions + + Two of the most important aspects of TCP are its congestion control + and loss recovery features. TCP traditionally treats lost packets as + indicating congestion-related loss, and cannot distinguish between + congestion-related loss and loss due to transmission errors. Even + when ECN is in use, there is a rather intimate coupling between + congestion control and loss recovery mechanisms. There are several + extensions to both features, and more often than not, a particular + extension applies to both. 
In this sub-section, we group + enhancements to either congestion control, loss recovery, or both, + which can be performed unilaterally; that is, without negotiating + support between endpoints. In the next sub-section, we group the + extensions that specify or rely on the SACK option, which must be + negotiated bilaterally. TCP implementations should include the + enhancements from both sub-sections so that TCP senders can perform + well without regard to the feature sets of other hosts they connect + to. For example, if SACK use is not successfully negotiated, a host + should use the NewReno behavior as a fall back. + + + + + + + + + + + + +Duke, et al. Informational [Page 7] + +RFC 4614 TCP Roadmap September 2006 + + + RFC 3042 S: "Enhancing TCP's Loss Recovery Using Limited Transmit" + (January 2001) + + Abstract: "This document proposes Limited Transmit, a new + Transmission Control Protocol (TCP) mechanism that can be used to + more effectively recover lost segments when a connection's + congestion window is small, or when a large number of segments are + lost in a single transmission window." [RFC3042] Tests from 2004 + showed that Limited Transmit was deployed in roughly one third of + the web servers tested [MAF04]. + + RFC 3390 S: "Increasing TCP's Initial Window" (October 2002) + + This document [RFC3390] updates RFC 2581 to permit an initial TCP + window of three or four segments during the slow-start phase, + depending on the segment size. + + RFC 3782 S: "The NewReno Modification to TCP's Fast Recovery + Algorithm" (April 2004) + + This document [RFC3782] specifies a modification to the standard + Reno fast recovery algorithm, whereby a TCP sender can use partial + acknowledgments to make inferences determining the next segment to + send in situations where SACK would be helpful but isn't + available. 
Although it is only a slight modification, the NewReno + behavior can make a significant difference in performance when + multiple segments are lost from a single window of data. + +3.2. SACK-Based Loss Recovery and Congestion Control + + The base TCP specification in RFC 793 provided only a simple + cumulative acknowledgment mechanism. However, a selective + acknowledgment (SACK) mechanism provides performance improvement in + the presence of multiple packet losses from the same flight, more + than outweighing the modest increase in complexity. A TCP should be + expected to implement SACK; however, SACK is a negotiated option and + is only used if support is advertised by both sides of a connection. + + RFC 2018 S: "TCP Selective Acknowledgment Options" (October 1996) + + This document [RFC2018] defines the basic selective acknowledgment + (SACK) mechanism for TCP. + + RFC 2883 S: "An Extension to the Selective Acknowledgement (SACK) + Option for TCP" (July 2000) + + This document [RFC2883] extends RFC 2018 to cover the case of + acknowledging duplicate segments. + + + +Duke, et al. Informational [Page 8] + +RFC 4614 TCP Roadmap September 2006 + + + RFC 3517 S: "A Conservative Selective Acknowledgment (SACK)-based + Loss Recovery Algorithm for TCP" (April 2003) + + This document [RFC3517] describes a relatively sophisticated + algorithm that a TCP sender can use for loss recovery when SACK + reports more than one segment lost from a single flight of data. + Although support for the exchange of SACK information is widely + implemented, not all implementations use an algorithm as + sophisticated as that described in RFC 3517. + +3.3. Dealing with Forged Segments + + By default, TCP lacks any cryptographic structures to differentiate + legitimate segments and those spoofed from malicious hosts. Spoofing + valid segments requires correctly guessing a number of fields. 
The + documents in this sub-section describe ways to make that guessing + harder, or to prevent it from being able to affect a connection + negatively. + + The TCPM working group is currently in progress towards fully + understanding and defining mechanisms for preventing spoofing attacks + (including both spoofed TCP segments and ICMP datagrams). Some of + the solutions being considered rely on TCP modifications, whereas + others rely on security at lower layers (like IPsec) for protection. + + RFC 1948 I: "Defending Against Sequence Number Attacks" (May 1996) + + This document [RFC1948] describes the TCP vulnerability that + allows an attacker to send forged TCP packets, by guessing the + initial sequence number in the three-way handshake. Simple + defenses against exploitation are then described. Some variation + is implemented in most currently used operating systems. + + RFC 2385 S: "Protection of BGP Sessions via the TCP MD5 Signature + Option" (August 1998) + + From document: "This document describes current existing practice + for securing BGP against certain simple attacks. It is understood + to have security weaknesses against concerted attacks. + + This memo describes a TCP extension to enhance security for BGP. + It defines a new TCP option for carrying an MD5 digest in a TCP + segment. This digest acts like a signature for that segment, + incorporating information known only to the connection end points. + Since BGP uses TCP as its transport, using this option in the way + described in this paper significantly reduces the danger from + certain security attacks on BGP." [RFC2385] + + + + +Duke, et al. Informational [Page 9] + +RFC 4614 TCP Roadmap September 2006 + + + TCP MD5 options are currently only used in very limited contexts, + primarily for defending BGP exchanges between routers. Some + deployment notes for those using TCP MD5 are found in the later + RFC 3562, "Key Management Considerations for the TCP MD5 Signature + Option" [RFC3562]. 
RFC 4278 deprecates the use of TCP MD5 outside + BGP [RFC4278]. + +4. Experimental Extensions + + The RFCs in this section are still experimental, but they may become + proposed standards in the future. At least part of the reason that + they are still experimental is to gain more wide-scale experience + with them before a standards track decision is made. By their + publication as experimental RFCs, it is hoped that the community of + TCP researchers will analyze and test the contents of these RFCs. + Although experimentation is encouraged, there is not yet formal + consensus that these are fully logical and safe behaviors. Wide- + scale deployment of implementations that use these features should be + well thought-out in terms of consequences. + + RFC 2140 I: "TCP Control Block Interdependence" (April 1997) + + This document [RFC2140] suggests how TCP connections between the + same endpoints might share information, such as their congestion + control state. To some degree, this is done in practice by a few + operating systems; for example, Linux currently has a destination + cache. Although this RFC is technically informational, the + concepts it describes are in experimental use, so we include it in + this section. + + A related proposal, the Congestion Manager, is specified in RFC + 3124 [RFC3124]. The idea behind the Congestion Manager, moving + congestion control outside of individual TCP connections, + represents a modification to the core of TCP, which supports + sharing information among TCP connections as well. Although a + Proposed Standard, some pieces of the Congestion Manager support + architecture have not been specified yet, and it has not achieved + use or implementation beyond experimental stacks, so it is not + listed among the standard TCP enhancements in this roadmap. + + RFC 2861 E: "TCP Congestion Window Validation" (June 2000) + + This document [RFC2861] suggests reducing the congestion window + over time when no packets are flowing. 
This behavior is more + aggressive than that specified in RFC 2581, which says that a TCP + sender SHOULD set its congestion window to the initial window + after an idle period of an RTO or greater. + + + + +Duke, et al. Informational [Page 10] + +RFC 4614 TCP Roadmap September 2006 + + + RFC 3465 E: "TCP Congestion Control with Appropriate Byte Counting + (ABC)" (February 2003) + + This document [RFC3465] suggests that congestion control use the + number of bytes acknowledged instead of the number of + acknowledgments received. This has been implemented in Linux. + The ABC mechanism behaves differently from the standard method + when there is not a one-to-one relationship between data segments + and acknowledgments. ABC still operates within the accepted + guidelines, but is more robust to delayed ACKs and ACK-division + [SCWA99][RFC3449]. + + RFC 3522 E: "The Eifel Detection Algorithm for TCP" (April 2003) + + The Eifel detection algorithm [RFC3522] allows a TCP sender to + detect a posteriori whether it has entered loss recovery + unnecessarily. + + RFC 3540 E: "Robust Explicit Congestion Notification (ECN) signaling + with Nonces" (June 2003) + + This document [RFC3540] suggests a modified ECN to address + security concerns and updates RFC 3168. + + RFC 3649 E: "HighSpeed TCP for Large Congestion Windows" (December + 2003) + + This document [RFC3649] suggests a modification to TCP's steady- + state behavior to use very large windows efficiently. + + RFC 3708 E: "Using TCP Duplicate Selective Acknowledgement (DSACKs) + and Stream Control Transmission Protocol (SCTP) Duplicate + Transmission Sequence Numbers (TSNs) to Detect Spurious + Retransmissions" (February 2004) + + Abstract: "TCP and Stream Control Transmission Protocol (SCTP) + provide notification of duplicate segment receipt through + Duplicate Selective Acknowledgement (DSACKs) and Duplicate + Transmission Sequence Number (TSN) notification, respectively. 
+ This document presents conservative methods of using this + information to identify unnecessary retransmissions for various + applications." [RFC3708] + + + + + + + + + +Duke, et al. Informational [Page 11] + +RFC 4614 TCP Roadmap September 2006 + + + RFC 3742 E: "Limited Slow-Start for TCP with Large Congestion + Windows" (March 2004) + + This document [RFC3742] describes a more conservative slow-start + behavior to prevent massive packet losses when a connection uses a + very large window. + + RFC 4015 S: "The Eifel Response Algorithm for TCP" (February 2005) + + This document [RFC4015] describes the response portion of the + Eifel algorithm, which can be used in conjunction with one of + several methods of detecting when loss recovery has been + spuriously entered, such as the Eifel detection algorithm in RFC + 3522, the algorithm in RFC 3708, or F-RTO in RFC 4138. + + Abstract: "Based on an appropriate detection algorithm, the Eifel + response algorithm provides a way for a TCP sender to respond to a + detected spurious timeout. It adapts the retransmission timer to + avoid further spurious timeouts, and can avoid - depending on the + detection algorithm - the often unnecessary go-back-N retransmits + that would otherwise be sent. In addition, the Eifel response + algorithm restores the congestion control state in such a way that + packet bursts are avoided." + + RFC 4015 is itself a Proposed Standard. The consensus of the TCPM + working group was to place it in this section of the roadmap + document due to three factors. + + 1. RFC 4015 operates on the output of a detection algorithm, for + which there is currently no available mechanism on the + standards track. + + 2. The working group was not aware of any wide deployment and use + of RFC 4015. + + 3. 
The consensus of the working group, after a discussion of the + known Intellectual Property Rights claims on the techniques + described in RFC 4015, identified this section of the roadmap + as an appropriate location. + + RFC 4138 E: "Forward RTO-Recovery (F-RTO): An Algorithm for Detecting + Spurious Retransmission Timeouts with TCP and the Stream Control + Transmission Protocol" (August 2005) + + The F-RTO detection algorithm [RFC4138] provides another option + for inferring spurious retransmission timeouts. Unlike some + similar detection methods, F-RTO does not rely on the use of any + TCP options. + + + +Duke, et al. Informational [Page 12] + +RFC 4614 TCP Roadmap September 2006 + + +5. Historic Extensions + + The RFCs listed here define extensions that have thus far failed to + arouse substantial interest from implementers, or that were found to + be defective for general use. + + RFC 1106 "TCP Big Window and NAK Options" (June 1989): found + defective + + This RFC [RFC1106] defined an alternative to the Window Scale + option for using large windows and described the "negative + acknowledgement" or NAK option. There is a comparison of NAK and + SACK methods, and early discussion of TCP over satellite issues. + RFC 1110 explains some problems with the approaches described in + RFC 1106. The options described in this document have not been + adopted by the larger community, although NAKs are used in the + SCPS-TP adaptation of TCP for satellite and spacecraft use, + developed by the Consultative Committee for Space Data Systems + (CCSDS). + + RFC 1110 "A Problem with the TCP Big Window Option" (August 1989): + deprecates RFC 1106 + + Abstract: "The TCP Big Window option discussed in RFC 1106 will + not work properly in an Internet environment which has both a high + bandwidth * delay product and the possibility of disordering and + duplicating packets. 
In such networks, the window size must not + be increased without a similar increase in the sequence number + space. Therefore, a different approach to big windows should be + taken in the Internet." [RFC1110] + + RFC 1146 E "TCP Alternate Checksum Options" (March 1990): lack of + interest + + This document [RFC1146] defined more robust TCP checksums than the + 16-bit ones-complement in use today. A typographical error in RFC + 1145 is fixed in RFC 1146; otherwise, the documents are the same. + + RFC 1263 "TCP Extensions Considered Harmful" (October 1991) - lack of + interest + + This document [RFC1263] argues against "backwards compatible" TCP + extensions. Specifically mentioned are several TCP enhancements + that have been successful, including timestamps, window scaling, + PAWS, and SACK. RFC 1263 presents an alternative approach called + "protocol evolution", whereby several evolutionary versions of TCP + would exist on hosts. These distinct TCP versions would represent + upgrades to each other and could be header-incompatible. + + + +Duke, et al. Informational [Page 13] + +RFC 4614 TCP Roadmap September 2006 + + + Interoperability would be provided by having a virtualization + layer select the right TCP version for a particular connection. + This idea did not catch on with the community, although the type + of extensions RFC 1263 specifically targeted as harmful did become + popular. + + RFC 1379 I "Extending TCP for Transactions -- Concepts" (November + 1992): found defective + + See RFC 1644. + + RFC 1644 E "T/TCP -- TCP Extensions for Transactions Functional + Specification" (July 1994): found defective + + The inventors of TCP believed that cached connection state could + have been used to eliminate TCP's 3-way handshake, to support + two-packet request/response exchanges. RFCs 1379 [RFC1379] and + 1644 [RFC1644] show that this is far from simple. Furthermore, + T/TCP floundered on the ease of denial-of-service attacks that can + result. 
One idea pioneered by T/TCP lives on in RFC 2140, in the + sharing of state across connections. + + RFC 1693 E "An Extension to TCP: Partial Order Service" (November + 1994): lack of interest + + This document [RFC1693] defines a TCP extension for applications + that do not care about the order in which application-layer + objects are received. Examples are multimedia and database + applications. In practice, these applications either accept the + possible performance loss because of TCP's strict ordering or use + more specialized transport protocols. + +6. Support Documents + + This section contains several classes of documents that do not + necessarily define current protocol behaviors, but that are + nevertheless of interest to TCP implementers. Section 6.1 describes + several foundational RFCs that give modern readers a better + understanding of the principles underlying TCP's behaviors and + development over the years. The documents listed in Section 6.2 + provide advice on using TCP in various types of network situations + that pose challenges above those of typical wired links. Some + implementation notes can be found in Section 6.3. The TCP Management + Information Bases are described in Section 6.4. RFCs that describe + tools for testing and debugging TCP implementations or that contain + high-level tutorials on the protocol are listed in Section 6.5, and + Section 6.6 lists a number of case studies that have explored TCP + performance. + + + +Duke, et al. Informational [Page 14] + +RFC 4614 TCP Roadmap September 2006 + + +6.1. Foundational Works + + The documents listed in this section contain information that is + largely duplicated by the standards documents previously discussed. + However, some of them contain a greater depth of problem statement + explanation or other context. 
Particularly, RFCs 813 - 817 (known as + the "Dave Clark Five") describe some early problems and solutions + (RFC 815 only describes the reassembly of IP fragments and is not + included in this TCP roadmap). + + RFC 813: "Window and Acknowledgement Strategy in TCP" (July 1982) + + This document [RFC0813] contains an early discussion of Silly + Window Syndrome and its avoidance and motivates and describes the + use of delayed acknowledgments. + + RFC 814: "Name, Addresses, Ports, and Routes" (July 1982) + + Suggestions and guidance for the design of tables and algorithms + to keep track of various identifiers within a TCP/IP + implementation are provided by this document [RFC0814]. + + RFC 816: "Fault Isolation and Recovery" (July 1982) + + In this document [RFC0816], TCP's response to indications of + network error conditions such as timeouts or received ICMP + messages is discussed. + + RFC 817: "Modularity and Efficiency in Protocol Implementation" (July + 1982) + + This document [RFC0817] contains implementation suggestions that + are general and not TCP specific. However, they have been used to + develop TCP implementations and to describe some performance + implications of the interactions between various layers in the + Internet stack. + + RFC 872: "TCP-ON-A-LAN" (September 1982) + + Conclusion: "The sometimes-expressed fear that using TCP on a + local net is a bad idea is unfounded." [RFC0872] + + RFC 896: "Congestion Control in IP/TCP Internetworks" (January 1984) + + This document [RFC0896] contains some early experiences with + congestion collapse and some initial thoughts on how to avoid it + using congestion control in TCP. + + + + +Duke, et al. 
Informational [Page 15] + +RFC 4614 TCP Roadmap September 2006 + + + RFC 964: "Some Problems with the Specification of the Military + Standard Transmission Control Protocol" (November 1985) + + This document [RFC0964] points out several specification bugs in + the US Military's MIL-STD-1778 document, which was intended as a + successor to RFC 793. This serves to remind us of the difficulty + in specification writing (even when we work from existing + documents!). + + RFC 1072: "TCP Extensions for Long-Delay Paths" (October 1988) + + This document [RFC1072] contains early explanations of the + mechanisms that were later described by RFCs 1323 and 2018, which + obsolete it. + + RFC 1185: "TCP Extension for High-Speed Paths" (October 1990) + + This document [RFC1185] builds on RFC 1072 to describe more + advanced strategies for dealing with sequence number wrapping and + detecting duplicates from earlier connections. This document was + obsoleted by RFC 1323. + + RFC 2914 B: "Congestion Control Principles" (September 2000) + + This document [RFC2914] motivates the use of end-to-end congestion + control for preventing congestion collapse and providing fairness + to TCP. + +6.2. Difficult Network Environments + + As the internetworking field has explored wireless, satellite, + cellular telephone, and other kinds of link-layer technologies, a + large body of work has built up on enhancing TCP performance for such + links. The RFCs listed in this section describe some of these more + challenging network environments and how TCP interacts with them. + + RFC 2488 B: "Enhancing TCP Over Satellite Channels using Standard + Mechanisms" (January 1999) + + From abstract: "While TCP works over satellite channels there are + several IETF standardized mechanisms that enable TCP to more + effectively utilize the available capacity of the network path. + This document outlines some of these TCP mitigations. 
At this + time, all mitigations discussed in this document are IETF + standards track mechanisms (or are compliant with IETF + standards)." [RFC2488] + + + + + +Duke, et al. Informational [Page 16] + +RFC 4614 TCP Roadmap September 2006 + + + RFC 2757 I: "Long Thin Networks" (January 2000) + + Several methods of improving TCP performance over long thin + networks, such as geosynchronous satellite links, are discussed in + this document [RFC2757]. A particular set of TCP options is + developed that should work well in such environments and be safe + to use in the global Internet. The implications of such + environments have been further discussed in RFC 3150 and RFC 3155, + and these documents should be preferred where there is overlap + between them and RFC 2757. + + RFC 2760 I: "Ongoing TCP Research Related to Satellites" (February + 2000) + + This document [RFC2760] discusses the advantages and disadvantages + of several different experimental means of improving TCP + performance over long-delay or error-prone paths. These include + T/TCP, larger initial windows, byte counting, delayed + acknowledgments, slow start thresholds, NewReno and SACK-based + loss recovery, FACK [MM96], ECN, various corruption-detection + mechanisms, congestion avoidance changes for fairness, use of + multiple parallel flows, pacing, header compression, state + sharing, and ACK congestion control, filtering, and + reconstruction. Although RFC 2488 looks at standard extensions, + this document focuses on more experimental means of performance + enhancement. + + RFC 3135 I: "Performance Enhancing Proxies Intended to Mitigate + Link-Related Degradations" (June 2001) + + From abstract: "This document is a survey of Performance Enhancing + Proxies (PEPs) often employed to improve degraded TCP performance + caused by characteristics of specific link environments, for + example, in satellite, wireless WAN, and wireless LAN + environments. 
Different types of Performance Enhancing Proxies + are described as well as the mechanisms used to improve + performance." [RFC3135] + + + + + + + + + + + + + + +Duke, et al. Informational [Page 17] + +RFC 4614 TCP Roadmap September 2006 + + + RFC 3150 B: "End-to-end Performance Implications of Slow Links" (July + 2001) + + From abstract: "This document makes performance-related + recommendations for users of network paths that traverse "very low + bit-rate" links....This recommendation may be useful in any + network where hosts can saturate available bandwidth, but the + design space for this recommendation explicitly includes + connections that traverse 56 Kb/second modem links or 4.8 Kb/ + second wireless access links - both of which are widely deployed." + [RFC3150] + + RFC 3155 B: "End-to-end Performance Implications of Links with + Errors" (August 2001) + + From abstract: "This document discusses the specific TCP + mechanisms that are problematic in environments with high + uncorrected error rates, and discusses what can be done to + mitigate the problems without introducing intermediate devices + into the connection." [RFC3155] + + RFC 3366 "Advice to link designers on link Automatic Repeat reQuest + (ARQ)" (August 2002) + + From abstract: "This document provides advice to the designers of + digital communication equipment and link-layer protocols employing + link-layer Automatic Repeat reQuest (ARQ) techniques. This + document presumes that the designers wish to support Internet + protocols, but may be unfamiliar with the architecture of the + Internet and with the implications of their design choices for the + performance and efficiency of Internet traffic carried over their + links." [RFC3366] + + RFC 3449 B: "TCP Performance Implications of Network Path Asymmetry" + (December 2002) + + From abstract: "This document describes TCP performance problems + that arise because of asymmetric effects. 
These problems arise in + several access networks, including bandwidth-asymmetric networks + and packet radio subnetworks, for different underlying reasons. + However, the end result on TCP performance is the same in both + cases: performance often degrades significantly because of + imperfection and variability in the ACK feedback from the receiver + to the sender. + + The document details several mitigations to these effects, which + have either been proposed or evaluated in the literature, or are + currently deployed in networks." [RFC3449] + + + +Duke, et al. Informational [Page 18] + +RFC 4614 TCP Roadmap September 2006 + + + RFC 3481 B: "TCP over Second (2.5G) and Third (3G) Generation + Wireless Networks" (February 2003) + + From abstract: "This document describes a profile for optimizing + TCP to adapt so that it handles paths including second (2.5G) and + third (3G) generation wireless networks." [RFC3481] + + RFC 3819 B: "Advice for Internet Subnetwork Designers" (July 2004) + + This document [RFC3819] describes how TCP performance can be + negatively affected by some particular lower-layer behaviors and + provides guidance in designing lower-layer networks and protocols + to be amicable to TCP. + +6.3. Implementation Advice + + RFC 879: "The TCP Maximum Segment Size and Related Topics" (November + 1983) + + Abstract: "This memo discusses the TCP Maximum Segment Size Option + and related topics. The purpose is to clarify some aspects of + TCP and its interaction with IP. This memo is a clarification to + the TCP specification, and contains information that may be + considered as 'advice to implementers'." [RFC0879] + + RFC 1071: "Computing the Internet Checksum" (September 1988) + + This document [RFC1071] lists a number of implementation + techniques for efficiently computing the Internet checksum (used + by TCP). 
+ + RFC 1624 I: "Computation of the Internet Checksum via Incremental + Update" (May 1994) + + Incrementally updating the Internet checksum is useful to routers + in updating IP checksums. Some middleboxes that alter TCP headers + may also be able to update the TCP checksum incrementally. This + document [RFC1624] expands upon the explanation of the incremental + update procedure in RFC 1071. + + RFC 1936 I: "Implementing the Internet Checksum in Hardware" (April + 1996) + + This document [RFC1936] describes the motivation for implementing + the Internet checksum in hardware, rather than in software, and + provides an implementation example. + + + + + +Duke, et al. Informational [Page 19] + +RFC 4614 TCP Roadmap September 2006 + + + RFC 2525 I: "Known TCP Implementation Problems" (March 1999) + + From abstract: "This memo catalogs a number of known TCP + implementation problems. The goal in doing so is to improve + conditions in the existing Internet by enhancing the quality of + current TCP/IP implementations." [RFC2525] + + RFC 2923 I: "TCP Problems with Path MTU Discovery" (September 2000) + + From abstract: "This memo catalogs several known Transmission + Control Protocol (TCP) implementation problems dealing with Path + Maximum Transmission Unit Discovery (PMTUD), including the long- + standing black hole problem, stretch acknowledgements (ACKs) due to + confusion between Maximum Segment Size (MSS) and segment size, and + MSS advertisement based on PMTU." [RFC2923] + + RFC 3360 B: "Inappropriate TCP Resets Considered Harmful" (August + 2002) + + This document [RFC3360] is a plea that firewall vendors not send + gratuitous TCP RST (Reset) packets when unassigned TCP header bits + are used. This practice prevents desirable extension and + evolution of the protocol and thus is potentially harmful to the + future of the Internet. 
+ + RFC 3493 I: "Basic Socket Interface Extensions for IPv6" (February + 2003) + + This document [RFC3493] describes the de facto standard sockets + API for programming with TCP. This API is implemented nearly + ubiquitously in modern operating systems and programming + languages. + +6.4. Management Information Bases + + The first MIB module defined for use with Simple Network Management + Protocol (SNMP) (in RFC 1066 and its update, RFC 1156) was a single + monolithic MIB module, called MIB-I. This evolved over time to be + MIB-II (RFC 1213). It then became apparent that having a single + monolithic MIB module was not scalable, given the number and breadth + of MIB data definitions that needed to be included. Thus, additional + MIB modules were defined, and those parts of MIB-II that needed to + evolve were split off. Eventually, the remaining parts of MIB-II + were also split off, the TCP-specific part being documented in RFC + 2012. + + + + + + +Duke, et al. Informational [Page 20] + +RFC 4614 TCP Roadmap September 2006 + + + RFC 2012 was obsoleted by RFC 4022, which is the primary TCP MIB + document today. MIB-I, defined in RFC 1156, has been obsoleted by + the MIB-II specification in RFC 1213. For current TCP implementers, + RFC 4022 should be supported. + + RFC 1066: "Management Information Base for Network Management of + TCP/IP-based Internets" (August 1988) + + This document [RFC1066] was the description of the TCP MIB. It + was obsoleted by RFC 1156. + + RFC 1156 S: "Management Information Base for Network Management of + TCP/IP-based Internets" (May 1990) + + This document [RFC1156] describes the required MIB fields for TCP + implementations, with minor corrections and no technical changes + from RFC 1066, which it obsoletes. This is the standards track + document for MIB-I. 
+ + RFC 1213 S: "Management Information Base for Network Management of + TCP/IP-based Internets: MIB-II" (March 1991) + + This document [RFC1213] describes the second version of the MIB in + a monolithic form. RFC 2012 updates this document by splitting + out the TCP-specific portions. + + RFC 2012 S: "SNMPv2 Management Information Base for the Transmission + Control Protocol using SMIv2" (November 1996) + + This document [RFC2012] defined the TCP MIB, in an update to RFC + 1213. It is now obsoleted by RFC 4022. + + RFC 2452 S: "IP Version 6 Management Information Base for the + Transmission Control Protocol" (December 1998) + + This document [RFC2452] augments RFC 2012 by adding an IPv6- + specific connection table. The rest of 2012 holds for any IP + version. RFC 2012 is now obsoleted by RFC 4022. + + Although it is a standards track document, RFC 2452 is considered + a historic mistake by the MIB community, as it is based on the + idea of parallel IPv4 and IPv6 structures. Although IPv6 requires + new structures, the community has decided to define a single + generic structure for both IPv4 and IPv6. This will aid in + definition, implementation, and transition between IPv4 and IPv6. + + + + + + +Duke, et al. Informational [Page 21] + +RFC 4614 TCP Roadmap September 2006 + + + RFC 4022 S: "Management Information Base for the Transmission Control + Protocol (TCP)" (March 2005) + + This document [RFC4022] obsoletes RFC 2012 and RFC 2452 and + specifies the current standard for the TCP MIB that should be + deployed. + +6.5. Tools and Tutorials + + RFC 1180 I: "TCP/IP Tutorial" (January 1991) + + This document [RFC1180] is an extremely brief overview of the + TCP/IP protocol suite as a whole. It gives some explanation as to + how and where TCP fits in. 
+ + RFC 1470 I: "FYI on a Network Management Tool Catalog: Tools for + Monitoring and Debugging TCP/IP Internets and Interconnected Devices" + (June 1993) + + A few of the tools that this document [RFC1470] describes are + still maintained and in use today; for example, ttcp and tcpdump. + However, many of the tools described do not relate specifically to + TCP and are no longer used or easily available. + + RFC 2398 I: "Some Testing Tools for TCP Implementors" (August 1998) + + This document [RFC2398] describes a number of TCP packet + generation and analysis tools. Although some of these tools are + no longer readily available or widely used, for the most part they + are still relevant and usable. + +6.6. Case Studies + + RFC 1337 I: "TIME-WAIT Assassination Hazards in TCP" (May 1992) + + This document [RFC1337] points out a problem with acting on + received reset segments while one is in the TIME-WAIT state. The + main recommendation is that hosts in TIME-WAIT ignore resets. + This recommendation might not currently be widely implemented. + + RFC 2415 I: "Simulation Studies of Increased Initial TCP Window Size" + (September 1998) + + This document [RFC2415] presents results of some simulations using + TCP initial windows greater than 1 segment. The analysis + indicates that user-perceived performance can be improved by + increasing the initial window to 3 segments. + + + + +Duke, et al. Informational [Page 22] + +RFC 4614 TCP Roadmap September 2006 + + + RFC 2416 I: "When TCP Starts Up With Four Packets Into Only Three + Buffers" (September 1998) + + This document [RFC2416] uses simulation results to clear up some + concerns about using an initial window of 4 segments when the + network path has less provisioning. 
+ + RFC 2884 I: "Performance Evaluation of Explicit Congestion + Notification (ECN) in IP Networks" (July 2000) + + This document [RFC2884] describes experimental results that show + some improvements to the performance of both short- and long-lived + connections due to ECN. + +7. Undocumented TCP Features + + There are a few important implementation tactics for the TCP that + have not yet been described in any RFC. Although this roadmap is + primarily concerned with mapping the TCP RFCs, this section is + included because an implementer needs to be aware of these important + issues. + + SYN Cookies + + A mechanism known as "SYN cookies" is widely used to thwart TCP + SYN flooding attacks, in which an attacker sends a flood of SYNs + to a victim but fails to complete the 3-way handshake. The result + is exhaustion of resources at the server. The SYN cookie + mechanism allows the server to return a cleverly chosen initial + sequence number that has all the required state for the secure + completion of the handshake. Then the server can avoid saving + connection state during the 3-way handshake and thus survive a SYN + flooding attack. + + A web search for "SYN cookies" will reveal a number of useful + descriptions of this mechanism, although there is currently no RFC + on the matter. + + Header Prediction + + Header prediction is a trick to speed up the processing of + segments. Van Jacobson and Mike Karels developed the technique in + the late 1980s. The basic idea is that some processing time can + be saved when most of a segment's fields can be predicted from + previous segments. A good description of this was sent to the + TCP-IP mailing list by Van Jacobson on March 9, 1988: + + + + + +Duke, et al. Informational [Page 23] + +RFC 4614 TCP Roadmap September 2006 + + + Quite a bit of the speedup comes from an algorithm that we + ('we' refers to collaborator Mike Karels and myself) are + calling "header prediction". 
The idea is that if you're in the + middle of a bulk data transfer and have just seen a packet, you + know what the next packet is going to look like: It will look + just like the current packet with either the sequence number or + ack number updated (depending on whether you're the sender or + receiver). Combining this with the "Use hints" epigram from + Butler Lampson's classic "Epigrams for System Designers", you + start to think of the tcp state (rcv.nxt, snd.una, etc.) as + "hints" about what the next packet should look like. + + If you arrange those "hints" so they match the layout of a tcp + packet header, it takes a single 14-byte compare to see if your + prediction is correct (3 longword compares to pick up the send + & ack sequence numbers, header length, flags and window, plus a + short compare on the length). If the prediction is correct, + there's a single test on the length to see if you're the sender + or receiver followed by the appropriate processing. E.g., if + the length is non-zero (you're the receiver), checksum and + append the data to the socket buffer then wake any process + that's sleeping on the buffer. Update rcv.nxt by the length of + this packet (this updates your "prediction" of the next + packet). Check if you can handle another packet the same size + as the current one. If not, set one of the unused flag bits in + your header prediction to guarantee that the prediction will + fail on the next packet and force you to go through full + protocol processing. Otherwise, you're done with this packet. + So, the *total* tcp protocol processing, exclusive of + checksumming, is on the order of 6 compares and an add. + +8. Security Considerations + + This document introduces no new security considerations. Each RFC + listed in this document attempts to address the security + considerations of the specification it contains. + +9. 
Acknowledgments + + This document grew out of a discussion on the end2end-interest + mailing list, the public list of the End-to-End Research Group of the + IRTF, and continued development under the IETF's TCP Maintenance and + Minor Extensions (TCPM) working group. We thank Joe Touch, Reiner + Ludwig, Pekka Savola, Gorry Fairhurst, and Sally Floyd for their + contributions, in particular. The chairs of the TCPM working group, + Mark Allman and Ted Faber, have been instrumental in the development + of this document. Keith McCloghrie provided some useful notes and + clarification on the various MIB-related RFCs. + + + +Duke, et al. Informational [Page 24] + +RFC 4614 TCP Roadmap September 2006 + + +10. Informative References + +10.1. Basic Functionality + + [RFC0793] Postel, J., "Transmission Control Protocol", STD 7, RFC + 793, September 1981. + + [RFC1122] Braden, R., "Requirements for Internet Hosts - + Communication Layers", STD 3, RFC 1122, October 1989. + + [RFC2026] Bradner, S., "The Internet Standards Process -- Revision + 3", BCP 9, RFC 2026, October 1996. + + [RFC2460] Deering, S. and R. Hinden, "Internet Protocol, Version 6 + (IPv6) Specification", RFC 2460, December 1998. + + [RFC2474] Nichols, K., Blake, S., Baker, F., and D. Black, + "Definition of the Differentiated Services Field (DS + Field) in the IPv4 and IPv6 Headers", RFC 2474, December + 1998. + + [RFC2581] Allman, M., Paxson, V., and W. Stevens, "TCP Congestion + Control", RFC 2581, April 1999. + + [RFC2675] Borman, D., Deering, S., and R. Hinden, "IPv6 Jumbograms", + RFC 2675, August 1999. + + [RFC2873] Xiao, X., Hannan, A., Paxson, V., and E. Crabbe, "TCP + Processing of the IPv4 Precedence Field", RFC 2873, June + 2000. + + [RFC2988] Paxson, V. and M. Allman, "Computing TCP's Retransmission + Timer", RFC 2988, November 2000. + +10.2. Recommended Enhancements + + [RFC1323] Jacobson, V., Braden, R., and D. Borman, "TCP Extensions + for High Performance", RFC 1323, May 1992. 
+ + [RFC1948] Bellovin, S., "Defending Against Sequence Number Attacks", + RFC 1948, May 1996. + + [RFC2018] Mathis, M., Mahdavi, J., Floyd, S., and A. Romanow, "TCP + Selective Acknowledgment Options", RFC 2018, October 1996. + + [RFC2385] Heffernan, A., "Protection of BGP Sessions via the TCP MD5 + Signature Option", RFC 2385, August 1998. + + + + +Duke, et al. Informational [Page 25] + +RFC 4614 TCP Roadmap September 2006 + + + [RFC2883] Floyd, S., Mahdavi, J., Mathis, M., and M. Podolsky, "An + Extension to the Selective Acknowledgement (SACK) Option + for TCP", RFC 2883, July 2000. + + [RFC3042] Allman, M., Balakrishnan, H., and S. Floyd, "Enhancing + TCP's Loss Recovery Using Limited Transmit", RFC 3042, + January 2001. + + [RFC3168] Ramakrishnan, K., Floyd, S., and D. Black, "The Addition + of Explicit Congestion Notification (ECN) to IP", RFC + 3168, September 2001. + + [RFC3390] Allman, M., Floyd, S., and C. Partridge, "Increasing TCP's + Initial Window", RFC 3390, October 2002. + + [RFC3517] Blanton, E., Allman, M., Fall, K., and L. Wang, "A + Conservative Selective Acknowledgment (SACK)-based Loss + Recovery Algorithm for TCP", RFC 3517, April 2003. + + [RFC3562] Leech, M., "Key Management Considerations for the TCP MD5 + Signature Option", RFC 3562, July 2003. + + [RFC3782] Floyd, S., Henderson, T., and A. Gurtov, "The NewReno + Modification to TCP's Fast Recovery Algorithm", RFC 3782, + April 2004. + + [RFC4015] Ludwig, R. and A. Gurtov, "The Eifel Response Algorithm + for TCP", RFC 4015, February 2005. + + [RFC4278] Bellovin, S. and A. Zinin, "Standards Maturity Variance + Regarding the TCP MD5 Signature Option (RFC 2385) and the + BGP-4 Specification", RFC 4278, January 2006. + +10.3. Experimental Extensions + + [RFC2140] Touch, J., "TCP Control Block Interdependence", RFC 2140, + April 1997. + + [RFC2861] Handley, M., Padhye, J., and S. Floyd, "TCP Congestion + Window Validation", RFC 2861, June 2000. + + [RFC3124] Balakrishnan, H. and S. 
Seshan, "The Congestion Manager", + RFC 3124, June 2001. + + [RFC3465] Allman, M., "TCP Congestion Control with Appropriate Byte + Counting (ABC)", RFC 3465, February 2003. + + + + + +Duke, et al. Informational [Page 26] + +RFC 4614 TCP Roadmap September 2006 + + + [RFC3522] Ludwig, R. and M. Meyer, "The Eifel Detection Algorithm + for TCP", RFC 3522, April 2003. + + [RFC3540] Spring, N., Wetherall, D., and D. Ely, "Robust Explicit + Congestion Notification (ECN) Signaling with Nonces", RFC + 3540, June 2003. + + [RFC3649] Floyd, S., "HighSpeed TCP for Large Congestion Windows", + RFC 3649, December 2003. + + [RFC3708] Blanton, E. and M. Allman, "Using TCP Duplicate Selective + Acknowledgement (DSACKs) and Stream Control Transmission + Protocol (SCTP) Duplicate Transmission Sequence Numbers + (TSNs) to Detect Spurious Retransmissions", RFC 3708, + February 2004. + + [RFC3742] Floyd, S., "Limited Slow-Start for TCP with Large + Congestion Windows", RFC 3742, March 2004. + + [RFC4138] Sarolahti, P. and M. Kojo, "Forward RTO-Recovery (F-RTO): + An Algorithm for Detecting Spurious Retransmission + Timeouts with TCP and the Stream Control Transmission + Protocol (SCTP)", RFC 4138, August 2005. + +10.4. Historic Extensions + + [RFC1106] Fox, R., "TCP big window and NAK options", RFC 1106, June + 1989. + + [RFC1110] McKenzie, A., "Problem with the TCP big window option", + RFC 1110, August 1989. + + [RFC1146] Zweig, J. and C. Partridge, "TCP alternate checksum + options", RFC 1146, March 1990. + + [RFC1263] O'Malley, S. and L. Peterson, "TCP Extensions Considered + Harmful", RFC 1263, October 1991. + + [RFC1379] Braden, R., "Extending TCP for Transactions -- Concepts", + RFC 1379, November 1992. + + [RFC1644] Braden, R., "T/TCP -- TCP Extensions for Transactions + Functional Specification", RFC 1644, July 1994. + + [RFC1693] Connolly, T., Amer, P., and P. Conrad, "An Extension to + TCP : Partial Order Service", RFC 1693, November 1994. + + + + + +Duke, et al. 
Informational [Page 27] + +RFC 4614 TCP Roadmap September 2006 + + +10.5. Support Documents + + [RFC0813] Clark, D., "Window and Acknowledgement Strategy in TCP", + RFC 813, July 1982. + + [RFC0814] Clark, D., "Name, addresses, ports, and routes", RFC 814, + July 1982. + + [RFC0816] Clark, D., "Fault isolation and recovery", RFC 816, July + 1982. + + [RFC0817] Clark, D., "Modularity and efficiency in protocol + implementation", RFC 817, July 1982. + + [RFC0872] Padlipsky, M., "TCP-on-a-LAN", RFC 872, September 1982. + + [RFC0879] Postel, J., "TCP maximum segment size and related topics", + RFC 879, November 1983. + + [RFC0896] Nagle, J., "Congestion control in IP/TCP internetworks", + RFC 896, January 1984. + + [RFC0964] Sidhu, D. and T. Blumer, "Some problems with the + specification of the Military Standard Transmission + Control Protocol", RFC 964, November 1985. + + [RFC1066] McCloghrie, K. and M. Rose, "Management Information Base + for Network Management of TCP/IP-based internets", RFC + 1066, August 1988. + + [RFC1071] Braden, R., Borman, D., and C. Partridge, "Computing the + Internet checksum", RFC 1071, September 1988. + + [RFC1072] Jacobson, V. and R. Braden, "TCP extensions for long-delay + paths", RFC 1072, October 1988. + + [RFC1156] McCloghrie, K. and M. Rose, "Management Information Base + for network management of TCP/IP-based internets", RFC + 1156, May 1990. + + [RFC1180] Socolofsky, T. and C. Kale, "TCP/IP tutorial", RFC 1180, + January 1991. + + [RFC1185] Jacobson, V., Braden, B., and L. Zhang, "TCP Extension for + High-Speed Paths", RFC 1185, October 1990. + + + + + + +Duke, et al. Informational [Page 28] + +RFC 4614 TCP Roadmap September 2006 + + + [RFC1213] McCloghrie, K. and M. Rose, "Management Information Base + for Network Management of TCP/IP-based internets: MIB-II", + STD 17, RFC 1213, March 1991. + + [RFC1337] Braden, R., "TIME-WAIT Assassination Hazards in TCP", RFC + 1337, May 1992. + + [RFC1470] Enger, R. and J. 
Reynolds, "FYI on a Network Management + Tool Catalog: Tools for Monitoring and Debugging TCP/IP + Internets and Interconnected Devices", FYI 2, RFC 1470, + June 1993. + + [RFC1624] Rijsinghani, A., "Computation of the Internet Checksum via + Incremental Update", RFC 1624, May 1994. + + [RFC1936] Touch, J. and B. Parham, "Implementing the Internet + Checksum in Hardware", RFC 1936, April 1996. + + [RFC2012] McCloghrie, K., "SNMPv2 Management Information Base for + the Transmission Control Protocol using SMIv2", RFC 2012, + November 1996. + + [RFC2398] Parker, S. and C. Schmechel, "Some Testing Tools for TCP + Implementors", RFC 2398, August 1998. + + [RFC2415] Poduri, K. and K. Nichols, "Simulation Studies of + Increased Initial TCP Window Size", RFC 2415, September + 1998. + + [RFC2416] Shepard, T. and C. Partridge, "When TCP Starts Up With + Four Packets Into Only Three Buffers", RFC 2416, September + 1998. + + [RFC2452] Daniele, M., "IP Version 6 Management Information Base for + the Transmission Control Protocol", RFC 2452, December + 1998. + + [RFC2488] Allman, M., Glover, D., and L. Sanchez, "Enhancing TCP + Over Satellite Channels using Standard Mechanisms", BCP + 28, RFC 2488, January 1999. + + [RFC2525] Paxson, V., Allman, M., Dawson, S., Fenner, W., Griner, + J., Heavens, I., Lahey, K., Semke, J., and B. Volz, "Known + TCP Implementation Problems", RFC 2525, March 1999. + + [RFC2757] Montenegro, G., Dawkins, S., Kojo, M., Magret, V., and N. + Vaidya, "Long Thin Networks", RFC 2757, January 2000. + + + + +Duke, et al. Informational [Page 29] + +RFC 4614 TCP Roadmap September 2006 + + + [RFC2760] Allman, M., Dawkins, S., Glover, D., Griner, J., Tran, D., + Henderson, T., Heidemann, J., Touch, J., Kruse, H., + Ostermann, S., Scott, K., and J. Semke, "Ongoing TCP + Research Related to Satellites", RFC 2760, February 2000. + + [RFC2884] Hadi Salim, J. and U. 
Ahmed, "Performance Evaluation of + Explicit Congestion Notification (ECN) in IP Networks", + RFC 2884, July 2000. + + [RFC2914] Floyd, S., "Congestion Control Principles", BCP 41, RFC + 2914, September 2000. + + [RFC2923] Lahey, K., "TCP Problems with Path MTU Discovery", RFC + 2923, September 2000. + + [RFC3135] Border, J., Kojo, M., Griner, J., Montenegro, G., and Z. + Shelby, "Performance Enhancing Proxies Intended to + Mitigate Link-Related Degradations", RFC 3135, June 2001. + + [RFC3150] Dawkins, S., Montenegro, G., Kojo, M., and V. Magret, + "End-to-end Performance Implications of Slow Links", BCP + 48, RFC 3150, July 2001. + + [RFC3155] Dawkins, S., Montenegro, G., Kojo, M., Magret, V., and N. + Vaidya, "End-to-end Performance Implications of Links with + Errors", BCP 50, RFC 3155, August 2001. + + [RFC3360] Floyd, S., "Inappropriate TCP Resets Considered Harmful", + BCP 60, RFC 3360, August 2002. + + [RFC3366] Fairhurst, G. and L. Wood, "Advice to link designers on + link Automatic Repeat reQuest (ARQ)", BCP 62, RFC 3366, + August 2002. + + [RFC3449] Balakrishnan, H., Padmanabhan, V., Fairhurst, G., and M. + Sooriyabandara, "TCP Performance Implications of Network + Path Asymmetry", BCP 69, RFC 3449, December 2002. + + [RFC3481] Inamura, H., Montenegro, G., Ludwig, R., Gurtov, A., and + F. Khafizov, "TCP over Second (2.5G) and Third (3G) + Generation Wireless Networks", BCP 71, RFC 3481, February + 2003. + + [RFC3493] Gilligan, R., Thomson, S., Bound, J., McCann, J., and W. + Stevens, "Basic Socket Interface Extensions for IPv6", RFC + 3493, February 2003. + + + + + +Duke, et al. Informational [Page 30] + +RFC 4614 TCP Roadmap September 2006 + + + [RFC3819] Karn, P., Bormann, C., Fairhurst, G., Grossman, D., + Ludwig, R., Mahdavi, J., Montenegro, G., Touch, J., and L. + Wood, "Advice for Internet Subnetwork Designers", BCP 89, + RFC 3819, July 2004. 
+ + [RFC4022] Raghunarayan, R., "Management Information Base for the + Transmission Control Protocol (TCP)", RFC 4022, March + 2005. + +10.6. Informative References Outside the RFC Series + + [JK92] Jacobson, V. and M. Karels, "Congestion Avoidance and + Control", This paper is a revised version of [Jac88], that + includes an additional appendix. This paper has not been + traditionally published, but is currently available at + ftp://ftp.ee.lbl.gov/papers/congavoid.ps.Z. 1992. + + [Jac88] Jacobson, V., "Congestion Avoidance and Control", ACM + SIGCOMM 1988 Proceedings, in ACM Computer Communication + Review, 18 (4), pp. 314-329, August 1988. + + [KP87] Karn, P. and C. Partridge, "Round Trip Time Estimation", + ACM SIGCOMM 1987 Proceedings, in ACM Computer + Communication Review, 17 (5), pp. 2-7, August 1987 + + [MAF04] Medina, A., Allman, M., and S. Floyd, "Measuring the + Evolution of Transport Protocols in the Internet", ACM + Computer Communication Review, 35 (2), April 2005. + + [MM96] Mathis, M. and J. Mahdavi, "Forward Acknowledgement: + Refining TCP Congestion Control", ACM SIGCOMM 1996 + Proceedings, in ACM Computer Communication Review 26 (4), + pp. 281-292, October 1996. + + [SCWA99] Savage, S., Cardwell, N., Wetherall, D., and T. Anderson, + "TCP Congestion Control with a Misbehaving Receiver", ACM + Computer Communication Review, 29 (5), pp. 71-78, October + 1999. + + + + + + + + + + + + + +Duke, et al. Informational [Page 31] + +RFC 4614 TCP Roadmap September 2006 + + +Authors' Addresses + + Martin H. Duke + The Boeing Company + PO Box 3707, MC 7L-49 + Seattle, WA 98124-2207 + + Phone: 425-373-2852 + EMail: martin.duke@boeing.com + + + Robert Braden + USC Information Sciences Institute + Marina del Rey, CA 90292-6695 + + Phone: 310-448-9173 + EMail: braden@isi.edu + + + Wesley M. 
Eddy + Verizon Federal Network Systems + 21000 Brookpark Rd, MS 54-5 + Cleveland, OH 44135 + + Phone: 216-433-6682 + EMail: weddy@grc.nasa.gov + + + Ethan Blanton + Purdue University Computer Science + 250 N. University St. + West Lafayette, IN 47907 + + EMail: eblanton@cs.purdue.edu + + + + + + + + + + + + + + + + + +Duke, et al. Informational [Page 32] + +RFC 4614 TCP Roadmap September 2006 + + +Full Copyright Statement + + Copyright (C) The Internet Society (2006). + + This document is subject to the rights, licenses and restrictions + contained in BCP 78, and except as set forth therein, the authors + retain all their rights. + + This document and the information contained herein are provided on an + "AS IS" basis and THE CONTRIBUTOR, THE ORGANIZATION HE/SHE REPRESENTS + OR IS SPONSORED BY (IF ANY), THE INTERNET SOCIETY AND THE INTERNET + ENGINEERING TASK FORCE DISCLAIM ALL WARRANTIES, EXPRESS OR IMPLIED, + INCLUDING BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE + INFORMATION HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED + WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + +Intellectual Property + + The IETF takes no position regarding the validity or scope of any + Intellectual Property Rights or other rights that might be claimed to + pertain to the implementation or use of the technology described in + this document or the extent to which any license under such rights + might or might not be available; nor does it represent that it has + made any independent effort to identify any such rights. Information + on the procedures with respect to rights in RFC documents can be + found in BCP 78 and BCP 79. 
+ + Copies of IPR disclosures made to the IETF Secretariat and any + assurances of licenses to be made available, or the result of an + attempt made to obtain a general license or permission for the use of + such proprietary rights by implementers or users of this + specification can be obtained from the IETF on-line IPR repository at + http://www.ietf.org/ipr. + + The IETF invites any interested party to bring to its attention any + copyrights, patents or patent applications, or other proprietary + rights that may cover technology that may be required to implement + this standard. Please address the information to the IETF at + ietf-ipr@ietf.org. + +Acknowledgement + + Funding for the RFC Editor function is provided by the IETF + Administrative Support Activity (IASA). + + + + + + + +Duke, et al. Informational [Page 33] + diff --git a/ext/picotcp/RFC/rfc6762.txt b/ext/picotcp/RFC/rfc6762.txt new file mode 100644 index 0000000..2c44359 --- /dev/null +++ b/ext/picotcp/RFC/rfc6762.txt @@ -0,0 +1,3923 @@ + + + + + + +Internet Engineering Task Force (IETF) S. Cheshire +Request for Comments: 6762 M. Krochmal +Category: Standards Track Apple Inc. +ISSN: 2070-1721 February 2013 + + + Multicast DNS + +Abstract + + As networked devices become smaller, more portable, and more + ubiquitous, the ability to operate with less configured + infrastructure is increasingly important. In particular, the ability + to look up DNS resource record data types (including, but not limited + to, host names) in the absence of a conventional managed DNS server + is useful. + + Multicast DNS (mDNS) provides the ability to perform DNS-like + operations on the local link in the absence of any conventional + Unicast DNS server. In addition, Multicast DNS designates a portion + of the DNS namespace to be free for local use, without the need to + pay any annual fee, and without the need to set up delegations or + otherwise configure a conventional DNS server to answer for those + names. 
+ + The primary benefits of Multicast DNS names are that (i) they require + little or no administration or configuration to set them up, (ii) + they work when no infrastructure is present, and (iii) they work + during infrastructure failures. + +Status of This Memo + + This is an Internet Standards Track document. + + This document is a product of the Internet Engineering Task Force + (IETF). It represents the consensus of the IETF community. It has + received public review and has been approved for publication by the + Internet Engineering Steering Group (IESG). Further information on + Internet Standards is available in Section 2 of RFC 5741. + + Information about the current status of this document, any errata, + and how to provide feedback on it may be obtained at + http://www.rfc-editor.org/info/rfc6762. + + + + + + + + +Cheshire & Krochmal Standards Track [Page 1] + +RFC 6762 Multicast DNS February 2013 + + +Copyright Notice + + Copyright (c) 2013 IETF Trust and the persons identified as the + document authors. All rights reserved. + + This document is subject to BCP 78 and the IETF Trust's Legal + Provisions Relating to IETF Documents + (http://trustee.ietf.org/license-info) in effect on the date of + publication of this document. Please review these documents + carefully, as they describe your rights and restrictions with respect + to this document. Code Components extracted from this document must + include Simplified BSD License text as described in Section 4.e of + the Trust Legal Provisions and are provided without warranty as + described in the Simplified BSD License. + + This document may contain material from IETF Documents or IETF + Contributions published or made publicly available before November + 10, 2008. The person(s) controlling the copyright in some of this + material may not have granted the IETF Trust the right to allow + modifications of such material outside the IETF Standards Process. 
+ Without obtaining an adequate license from the person(s) controlling + the copyright in such materials, this document may not be modified + outside the IETF Standards Process, and derivative works of it may + not be created outside the IETF Standards Process, except to format + it for publication as an RFC or to translate it into languages other + than English. + + + + + + + + + + + + + + + + + + + + + + + + + +Cheshire & Krochmal Standards Track [Page 2] + +RFC 6762 Multicast DNS February 2013 + + +Table of Contents + + 1. Introduction ....................................................4 + 2. Conventions and Terminology Used in This Document ...............4 + 3. Multicast DNS Names .............................................5 + 4. Reverse Address Mapping .........................................7 + 5. Querying ........................................................8 + 6. Responding .....................................................13 + 7. Traffic Reduction ..............................................22 + 8. Probing and Announcing on Startup ..............................25 + 9. Conflict Resolution ............................................31 + 10. Resource Record TTL Values and Cache Coherency ................33 + 11. Source Address Check ..........................................38 + 12. Special Characteristics of Multicast DNS Domains ..............40 + 13. Enabling and Disabling Multicast DNS ..........................41 + 14. Considerations for Multiple Interfaces ........................42 + 15. Considerations for Multiple Responders on the Same Machine ....43 + 16. Multicast DNS Character Set ...................................45 + 17. Multicast DNS Message Size ....................................46 + 18. Multicast DNS Message Format ..................................47 + 19. Summary of Differences between Multicast DNS and Unicast DNS ..51 + 20. IPv6 Considerations ...........................................52 + 21. 
Security Considerations .......................................52 + 22. IANA Considerations ...........................................53 + 23. Acknowledgments ...............................................56 + 24. References ....................................................56 + Appendix A. Design Rationale for Choice of UDP Port Number ........60 + Appendix B. Design Rationale for Not Using Hashed Multicast + Addresses .............................................61 + Appendix C. Design Rationale for Maximum Multicast DNS Name + Length ................................................62 + Appendix D. Benefits of Multicast Responses .......................64 + Appendix E. Design Rationale for Encoding Negative Responses ......65 + Appendix F. Use of UTF-8 ..........................................66 + Appendix G. Private DNS Namespaces ................................67 + Appendix H. Deployment History ....................................67 + + + + + + + + + + + + + + + +Cheshire & Krochmal Standards Track [Page 3] + +RFC 6762 Multicast DNS February 2013 + + +1. Introduction + + Multicast DNS and its companion technology DNS-Based Service + Discovery [RFC6763] were created to provide IP networking with the + ease-of-use and autoconfiguration for which AppleTalk was well-known + [RFC6760]. When reading this document, familiarity with the concepts + of Zero Configuration Networking [Zeroconf] and automatic link-local + addressing [RFC3927] [RFC4862] is helpful. + + Multicast DNS borrows heavily from the existing DNS protocol + [RFC1034] [RFC1035] [RFC6195], using the existing DNS message + structure, name syntax, and resource record types. This document + specifies no new operation codes or response codes. This document + describes how clients send DNS-like queries via IP multicast, and how + a collection of hosts cooperate to collectively answer those queries + in a useful manner. + +2. 
Conventions and Terminology Used in This Document + + The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", + "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this + document are to be interpreted as described in "Key words for use in + RFCs to Indicate Requirement Levels" [RFC2119]. + + When this document uses the term "Multicast DNS", it should be taken + to mean: "Clients performing DNS-like queries for DNS-like resource + records by sending DNS-like UDP query and response messages over IP + Multicast to UDP port 5353". The design rationale for selecting UDP + port 5353 is discussed in Appendix A. + + This document uses the term "host name" in the strict sense to mean a + fully qualified domain name that has an IPv4 or IPv6 address record. + It does not use the term "host name" in the commonly used but + incorrect sense to mean just the first DNS label of a host's fully + qualified domain name. + + A DNS (or mDNS) packet contains an IP Time to Live (TTL) in the IP + header, which is effectively a hop-count limit for the packet, to + guard against routing loops. Each resource record also contains a + TTL, which is the number of seconds for which the resource record may + be cached. This document uses the term "IP TTL" to refer to the IP + header TTL (hop limit), and the term "RR TTL" or just "TTL" to refer + to the resource record TTL (cache lifetime). + + DNS-format messages contain a header, a Question Section, then + Answer, Authority, and Additional Record Sections. The Answer, + Authority, and Additional Record Sections all hold resource records + + + + +Cheshire & Krochmal Standards Track [Page 4] + +RFC 6762 Multicast DNS February 2013 + + + in the same format. Where this document describes issues that apply + equally to all three sections, it uses the term "Resource Record + Sections" to refer collectively to these three sections. 
+ + This document uses the terms "shared" and "unique" when referring to + resource record sets [RFC1034]: + + A "shared" resource record set is one where several Multicast DNS + responders may have records with the same name, rrtype, and + rrclass, and several responders may respond to a particular query. + + A "unique" resource record set is one where all the records with + that name, rrtype, and rrclass are conceptually under the control + or ownership of a single responder, and it is expected that at + most one responder should respond to a query for that name, + rrtype, and rrclass. Before claiming ownership of a unique + resource record set, a responder MUST probe to verify that no + other responder already claims ownership of that set, as described + in Section 8.1, "Probing". (For fault-tolerance and other + reasons, sometimes it is permissible to have more than one + responder answering for a particular "unique" resource record set, + but such cooperating responders MUST give answers containing + identical rdata for these records. If they do not give answers + containing identical rdata, then the probing step will reject the + data as being inconsistent with what is already being advertised + on the network for those names.) + + Strictly speaking, the terms "shared" and "unique" apply to resource + record sets, not to individual resource records. However, it is + sometimes convenient to talk of "shared resource records" and "unique + resource records". When used this way, the terms should be + understood to mean a record that is a member of a "shared" or + "unique" resource record set, respectively. + +3. Multicast DNS Names + + A host that belongs to an organization or individual who has control + over some portion of the DNS namespace can be assigned a globally + unique name within that portion of the DNS namespace, such as, + "cheshire.example.com.". For those of us who have this luxury, this + works very well. 
However, the majority of home computer users do not + have easy access to any portion of the global DNS namespace within + which they have the authority to create names. This leaves the + majority of home computers effectively anonymous for practical + purposes. + + + + + + +Cheshire & Krochmal Standards Track [Page 5] + +RFC 6762 Multicast DNS February 2013 + + + To remedy this problem, this document allows any computer user to + elect to give their computers link-local Multicast DNS host names of + the form: "single-dns-label.local.". For example, a laptop computer + may answer to the name "MyComputer.local.". Any computer user is + granted the authority to name their computer this way, provided that + the chosen host name is not already in use on that link. Having + named their computer this way, the user has the authority to continue + utilizing that name until such time as a name conflict occurs on the + link that is not resolved in the user's favor. If this happens, the + computer (or its human user) MUST cease using the name, and SHOULD + attempt to allocate a new unique name for use on that link. These + conflicts are expected to be relatively rare for people who choose + reasonably imaginative names, but it is still important to have a + mechanism in place to handle them when they happen. + + This document specifies that the DNS top-level domain ".local." is a + special domain with special semantics, namely that any fully + qualified name ending in ".local." is link-local, and names within + this domain are meaningful only on the link where they originate. + This is analogous to IPv4 addresses in the 169.254/16 prefix or IPv6 + addresses in the FE80::/10 prefix, which are link-local and + meaningful only on the link where they originate. + + Any DNS query for a name ending with ".local." MUST be sent to the + mDNS IPv4 link-local multicast address 224.0.0.251 (or its IPv6 + equivalent FF02::FB). 
The design rationale for using a fixed + multicast address instead of selecting from a range of multicast + addresses using a hash function is discussed in Appendix B. + Implementers MAY choose to look up such names concurrently via other + mechanisms (e.g., Unicast DNS) and coalesce the results in some + fashion. Implementers choosing to do this should be aware of the + potential for user confusion when a given name can produce different + results depending on external network conditions (such as, but not + limited to, which name lookup mechanism responds faster). + + It is unimportant whether a name ending with ".local." occurred + because the user explicitly typed in a fully qualified domain name + ending in ".local.", or because the user entered an unqualified + domain name and the host software appended the suffix ".local." + because that suffix appears in the user's search list. The ".local." + suffix could appear in the search list because the user manually + configured it, or because it was received via DHCP [RFC2132] or via + any other mechanism for configuring the DNS search list. In this + respect the ".local." suffix is treated no differently from any other + search domain that might appear in the DNS search list. + + + + + + +Cheshire & Krochmal Standards Track [Page 6] + +RFC 6762 Multicast DNS February 2013 + + + DNS queries for names that do not end with ".local." MAY be sent to + the mDNS multicast address, if no other conventional DNS server is + available. This can allow hosts on the same link to continue + communicating using each other's globally unique DNS names during + network outages that disrupt communication with the greater Internet. + When resolving global names via local multicast, it is even more + important to use DNS Security Extensions (DNSSEC) [RFC4033] or other + security mechanisms to ensure that the response is trustworthy. 
+ Resolving global names via local multicast is a contentious issue, + and this document does not discuss it further, instead concentrating + on the issue of resolving local names using DNS messages sent to a + multicast address. + + This document recommends a single flat namespace for dot-local host + names, (i.e., the names of DNS "A" and "AAAA" records, which map + names to IPv4 and IPv6 addresses), but other DNS record types (such + as those used by DNS-Based Service Discovery [RFC6763]) may contain + as many labels as appropriate for the desired usage, up to a maximum + of 255 bytes, plus a terminating zero byte at the end. Name length + issues are discussed further in Appendix C. + + Enforcing uniqueness of host names is probably desirable in the + common case, but this document does not mandate that. It is + permissible for a collection of coordinated hosts to agree to + maintain multiple DNS address records with the same name, possibly + for load-balancing or fault-tolerance reasons. This document does + not take a position on whether that is sensible. It is important + that both modes of operation be supported. The Multicast DNS + protocol allows hosts to verify and maintain unique names for + resource records where that behavior is desired, and it also allows + hosts to maintain multiple resource records with a single shared name + where that behavior is desired. This consideration applies to all + resource records, not just address records (host names). In summary: + It is required that the protocol have the ability to detect and + handle name conflicts, but it is not required that this ability be + used for every record. + +4. Reverse Address Mapping + + Like ".local.", the IPv4 and IPv6 reverse mapping domains are also + defined to be link-local: + + Any DNS query for a name ending with "254.169.in-addr.arpa." MUST + be sent to the mDNS IPv4 link-local multicast address 224.0.0.251 + or the mDNS IPv6 multicast address FF02::FB. 
Since names under + this domain correspond to IPv4 link-local addresses, it is logical + that the local link is the best place to find information + pertaining to those names. + + + +Cheshire & Krochmal Standards Track [Page 7] + +RFC 6762 Multicast DNS February 2013 + + + Likewise, any DNS query for a name within the reverse mapping + domains for IPv6 link-local addresses ("8.e.f.ip6.arpa.", + "9.e.f.ip6.arpa.", "a.e.f.ip6.arpa.", and "b.e.f.ip6.arpa.") MUST + be sent to the mDNS IPv6 link-local multicast address FF02::FB or + the mDNS IPv4 link-local multicast address 224.0.0.251. + +5. Querying + + There are two kinds of Multicast DNS queries: one-shot queries of the + kind made by legacy DNS resolvers, and continuous, ongoing Multicast + DNS queries made by fully compliant Multicast DNS queriers, which + support asynchronous operations including DNS-Based Service Discovery + [RFC6763]. + + Except in the rare case of a Multicast DNS responder that is + advertising only shared resource records and no unique records, a + Multicast DNS responder MUST also implement a Multicast DNS querier + so that it can first verify the uniqueness of those records before it + begins answering queries for them. + +5.1. One-Shot Multicast DNS Queries + + The most basic kind of Multicast DNS client may simply send standard + DNS queries blindly to 224.0.0.251:5353, without necessarily even + being aware of what a multicast address is. This change can + typically be implemented with just a few lines of code in an existing + DNS resolver library. If a name being queried falls within one of + the reserved Multicast DNS domains (see Sections 3 and 4), then, + rather than using the configured Unicast DNS server address, the + query is instead sent to 224.0.0.251:5353 (or its IPv6 equivalent + [FF02::FB]:5353). Typically, the timeout would also be shortened to + two or three seconds. It's possible to make a minimal Multicast DNS + resolver with only these simple changes. 
These queries are typically + done using a high-numbered ephemeral UDP source port, but regardless + of whether they are sent from a dynamic port or from a fixed port, + these queries MUST NOT be sent using UDP source port 5353, since + using UDP source port 5353 signals the presence of a fully compliant + Multicast DNS querier, as described below. + + A simple DNS resolver like this will typically just take the first + response it receives. It will not listen for additional UDP + responses, but in many instances this may not be a serious problem. + If a user types "http://MyPrinter.local." into their web browser, and + their simple DNS resolver just takes the first response it receives, + and the user gets to see the status and configuration web page for + their printer, then the protocol has met the user's needs in this + case. + + + + +Cheshire & Krochmal Standards Track [Page 8] + +RFC 6762 Multicast DNS February 2013 + + + While a basic DNS resolver like this may be adequate for simple host + name lookup, it may not get ideal behavior in other cases. + Additional refinements to create a fully compliant Multicast DNS + querier are described below. + +5.2. Continuous Multicast DNS Querying + + In one-shot queries, the underlying assumption is that the + transaction begins when the application issues a query, and ends when + the first response is received. There is another type of query + operation that is more asynchronous, in which having received one + response is not necessarily an indication that there will be no more + relevant responses, and the querying operation continues until no + further responses are required. Determining when no further + responses are required depends on the type of operation being + performed. If the operation is looking up the IPv4 and IPv6 + addresses of another host, then no further responses are required + once a successful connection has been made to one of those IPv4 or + IPv6 addresses. 
If the operation is browsing to present the user + with a list of DNS-SD services found on the network [RFC6763], then + no further responses are required once the user indicates this to the + user-interface software, e.g., by closing the network browsing window + that was displaying the list of discovered services. + + Imagine some hypothetical software that allows users to discover + network printers. The user wishes to discover all printers on the + local network, not only the printer that is quickest to respond. + When the user is actively looking for a network printer to use, they + open a network browsing window that displays the list of discovered + printers. It would be convenient for the user if they could rely on + this list of network printers to stay up to date as network printers + come and go, rather than displaying out-of-date stale information, + and requiring the user explicitly to click a "refresh" button any + time they want to see accurate information (which, from the moment it + is displayed, is itself already beginning to become out-of-date and + stale). If we are to display a continuously updated live list like + this, we need to be able to do it efficiently, without naive constant + polling, which would be an unreasonable burden on the network. It is + not expected that all users will be browsing to discover new printers + all the time, but when a user is browsing to discover service + instances for an extended period, we want to be able to support that + operation efficiently. + + Therefore, when retransmitting Multicast DNS queries to implement + this kind of continuous monitoring, the interval between the first + two queries MUST be at least one second, the intervals between + successive queries MUST increase by at least a factor of two, and the + querier MUST implement Known-Answer Suppression, as described below + + + +Cheshire & Krochmal Standards Track [Page 9] + +RFC 6762 Multicast DNS February 2013 + + + in Section 7.1. 
The Known-Answer Suppression mechanism tells + responders which answers are already known to the querier, thereby + allowing responders to avoid wasting network capacity with pointless + repeated transmission of those answers. A querier retransmits its + question because it wishes to receive answers it may have missed the + first time, not because it wants additional duplicate copies of + answers it already received. Failure to implement Known-Answer + Suppression can result in unacceptable levels of network traffic. + When the interval between queries reaches or exceeds 60 minutes, a + querier MAY cap the interval to a maximum of 60 minutes, and perform + subsequent queries at a steady-state rate of one query per hour. To + avoid accidental synchronization when, for some reason, multiple + clients begin querying at exactly the same moment (e.g., because of + some common external trigger event), a Multicast DNS querier SHOULD + also delay the first query of the series by a randomly chosen amount + in the range 20-120 ms. + + When a Multicast DNS querier receives an answer, the answer contains + a TTL value that indicates for how many seconds this answer is valid. + After this interval has passed, the answer will no longer be valid + and SHOULD be deleted from the cache. Before the record expiry time + is reached, a Multicast DNS querier that has local clients with an + active interest in the state of that record (e.g., a network browsing + window displaying a list of discovered services to the user) SHOULD + reissue its query to determine whether the record is still valid. + + To perform this cache maintenance, a Multicast DNS querier should + plan to retransmit its query after at least 50% of the record + lifetime has elapsed. This document recommends the following + specific strategy. + + The querier should plan to issue a query at 80% of the record + lifetime, and then if no answer is received, at 85%, 90%, and 95%. 
+ If an answer is received, then the remaining TTL is reset to the + value given in the answer, and this process repeats for as long as + the Multicast DNS querier has an ongoing interest in the record. If + no answer is received after four queries, the record is deleted when + it reaches 100% of its lifetime. A Multicast DNS querier MUST NOT + perform this cache maintenance for records for which it has no local + clients with an active interest. If the expiry of a particular + record from the cache would result in no net effect to any client + software running on the querier device, and no visible effect to the + human user, then there is no reason for the Multicast DNS querier to + waste network capacity checking whether the record remains valid. + + + + + + + +Cheshire & Krochmal Standards Track [Page 10] + +RFC 6762 Multicast DNS February 2013 + + + To avoid the case where multiple Multicast DNS queriers on a network + all issue their queries simultaneously, a random variation of 2% of + the record TTL should be added, so that queries are scheduled to be + performed at 80-82%, 85-87%, 90-92%, and then 95-97% of the TTL. + + An additional efficiency optimization SHOULD be performed when a + Multicast DNS response is received containing a unique answer (as + indicated by the cache-flush bit being set, described in Section + 10.2, "Announcements to Flush Outdated Cache Entries"). In this + case, there is no need for the querier to continue issuing a stream + of queries with exponentially increasing intervals, since the receipt + of a unique answer is a good indication that no other answers will be + forthcoming. In this case, the Multicast DNS querier SHOULD plan to + issue its next query for this record at 80-82% of the record's TTL, + as described above. 
+ + A compliant Multicast DNS querier, which implements the rules + specified in this document, MUST send its Multicast DNS queries from + UDP source port 5353 (the well-known port assigned to mDNS), and MUST + listen for Multicast DNS replies sent to UDP destination port 5353 at + the mDNS link-local multicast address (224.0.0.251 and/or its IPv6 + equivalent FF02::FB). + +5.3. Multiple Questions per Query + + Multicast DNS allows a querier to place multiple questions in the + Question Section of a single Multicast DNS query message. + + The semantics of a Multicast DNS query message containing multiple + questions is identical to a series of individual DNS query messages + containing one question each. Combining multiple questions into a + single message is purely an efficiency optimization and has no other + semantic significance. + +5.4. Questions Requesting Unicast Responses + + Sending Multicast DNS responses via multicast has the benefit that + all the other hosts on the network get to see those responses, + enabling them to keep their caches up to date and detect conflicting + responses. + + However, there are situations where all the other hosts on the + network don't need to see every response. Some examples are a laptop + computer waking from sleep, the Ethernet cable being connected to a + running machine, or a previously inactive interface being activated + through a configuration change. At the instant of wake-up or link + activation, the machine is a brand new participant on a new network. + Its Multicast DNS cache for that interface is empty, and it has no + + + +Cheshire & Krochmal Standards Track [Page 11] + +RFC 6762 Multicast DNS February 2013 + + + knowledge of its peers on that link. It may have a significant + number of questions that it wants answered right away, to discover + information about its new surroundings and present that information + to the user. 
As a new participant on the network, it has no idea + whether the exact same questions may have been asked and answered + just seconds ago. In this case, triggering a large sudden flood of + multicast responses may impose an unreasonable burden on the network. + + To avoid large floods of potentially unnecessary responses in these + cases, Multicast DNS defines the top bit in the class field of a DNS + question as the unicast-response bit. When this bit is set in a + question, it indicates that the querier is willing to accept unicast + replies in response to this specific query, as well as the usual + multicast responses. These questions requesting unicast responses + are referred to as "QU" questions, to distinguish them from the more + usual questions requesting multicast responses ("QM" questions). A + Multicast DNS querier sending its initial batch of questions + immediately on wake from sleep or interface activation SHOULD set the + unicast-response bit in those questions. + + When a question is retransmitted (as described in Section 5.2), the + unicast-response bit SHOULD NOT be set in subsequent retransmissions + of that question. Subsequent retransmissions SHOULD be usual "QM" + questions. After the first question has received its responses, the + querier should have a large Known-Answer list (Section 7.1) so that + subsequent queries should elicit few, if any, further responses. + Reverting to multicast responses as soon as possible is important + because of the benefits that multicast responses provide (see + Appendix D). In addition, the unicast-response bit SHOULD be set + only for questions that are active and ready to be sent the moment of + wake from sleep or interface activation. New questions created by + local clients afterwards should be treated as normal "QM" questions + and SHOULD NOT have the unicast-response bit set on the first + question of the series. 
+ + When receiving a question with the unicast-response bit set, a + responder SHOULD usually respond with a unicast packet directed back + to the querier. However, if the responder has not multicast that + record recently (within one quarter of its TTL), then the responder + SHOULD instead multicast the response so as to keep all the peer + caches up to date, and to permit passive conflict detection. In the + case of answering a probe question (Section 8.1) with the unicast- + response bit set, the responder should always generate the requested + unicast response, but it may also send a multicast announcement if + the time since the last multicast announcement of that record is more + than a quarter of its TTL. + + + + + +Cheshire & Krochmal Standards Track [Page 12] + +RFC 6762 Multicast DNS February 2013 + + + Unicast replies are subject to all the same packet generation rules + as multicast replies, including the cache-flush bit (Section 10.2) + and (except when defending a unique name against a probe from another + host) randomized delays to reduce network collisions (Section 6). + +5.5. Direct Unicast Queries to Port 5353 + + In specialized applications there may be rare situations where it + makes sense for a Multicast DNS querier to send its query via unicast + to a specific machine. When a Multicast DNS responder receives a + query via direct unicast, it SHOULD respond as it would for "QU" + questions, as described above in Section 5.4. Since it is possible + for a unicast query to be received from a machine outside the local + link, responders SHOULD check that the source address in the query + packet matches the local subnet for that link (or, in the case of + IPv6, the source address has an on-link prefix) and silently ignore + the packet if not. + + There may be specialized situations, outside the scope of this + document, where it is intended and desirable to create a responder + that does answer queries originating outside the local link. 
Such a + responder would need to ensure that these non-local queries are + always answered via unicast back to the querier, since an answer sent + via link-local multicast would not reach a querier outside the local + link. + +6. Responding + + When a Multicast DNS responder constructs and sends a Multicast DNS + response message, the Resource Record Sections of that message must + contain only records for which that responder is explicitly + authoritative. These answers may be generated because the record + answers a question received in a Multicast DNS query message, or at + certain other times that the responder determines that an unsolicited + announcement is warranted. A Multicast DNS responder MUST NOT place + records from its cache, which have been learned from other responders + on the network, in the Resource Record Sections of outgoing response + messages. Only an authoritative source for a given record is allowed + to issue responses containing that record. + + The determination of whether a given record answers a given question + is made using the standard DNS rules: the record name must match the + question name, the record rrtype must match the question qtype unless + the qtype is "ANY" (255) or the rrtype is "CNAME" (5), and the record + rrclass must match the question qclass unless the qclass is "ANY" + (255). As with Unicast DNS, generally only DNS class 1 ("Internet") + is used, but should client software use classes other than 1, the + matching rules described above MUST be used. + + + +Cheshire & Krochmal Standards Track [Page 13] + +RFC 6762 Multicast DNS February 2013 + + + A Multicast DNS responder MUST only respond when it has a positive, + non-null response to send, or it authoritatively knows that a + particular record does not exist. For unique records, where the host + has already established sole ownership of the name, it MUST return + negative answers to queries for records that it knows not to exist.
+ For example, a host with no IPv6 address, that has claimed sole + ownership of the name "host.local." for all rrtypes, MUST respond to + AAAA queries for "host.local." by sending a negative answer + indicating that no AAAA records exist for that name. See Section + 6.1, "Negative Responses". For shared records, which are owned by no + single host, the nonexistence of a given record is ascertained by the + failure of any machine to respond to the Multicast DNS query, not by + any explicit negative response. For shared records, NXDOMAIN and + other error responses MUST NOT be sent. + + Multicast DNS responses MUST NOT contain any questions in the + Question Section. Any questions in the Question Section of a + received Multicast DNS response MUST be silently ignored. Multicast + DNS queriers receiving Multicast DNS responses do not care what + question elicited the response; they care only that the information + in the response is true and accurate. + + A Multicast DNS responder on Ethernet [IEEE.802.3] and similar shared + multiple access networks SHOULD have the capability of delaying its + responses by up to 500 ms, as described below. + + If a large number of Multicast DNS responders were all to respond + immediately to a particular query, a collision would be virtually + guaranteed. By imposing a small random delay, the number of + collisions is dramatically reduced. On a full-sized Ethernet using + the maximum cable lengths allowed and the maximum number of repeaters + allowed, an Ethernet frame is vulnerable to collisions during the + transmission of its first 256 bits. On 10 Mb/s Ethernet, this + equates to a vulnerable time window of 25.6 microseconds. On higher- + speed variants of Ethernet, the vulnerable time window is shorter. 
+ + In the case where a Multicast DNS responder has good reason to + believe that it will be the only responder on the link that will send + a response (i.e., because it is able to answer every question in the + query message, and for all of those answer records it has previously + verified that the name, rrtype, and rrclass are unique on the link), + it SHOULD NOT impose any random delay before responding, and SHOULD + normally generate its response within at most 10 ms. In particular, + this applies to responding to probe queries with the unicast-response + bit set. Since receiving a probe query gives a clear indication that + some other responder is planning to start using this name in the very + near future, answering such probe queries to defend a unique record + is a high priority and needs to be done without delay. A probe query + + + +Cheshire & Krochmal Standards Track [Page 14] + +RFC 6762 Multicast DNS February 2013 + + + can be distinguished from a normal query by the fact that a probe + query contains a proposed record in the Authority Section that + answers the question in the Question Section (for more details, see + Section 8.2, "Simultaneous Probe Tiebreaking"). + + Responding without delay is appropriate for records like the address + record for a particular host name, when the host name has been + previously verified unique. Responding without delay is *not* + appropriate for things like looking up PTR records used for DNS-Based + Service Discovery [RFC6763], where a large number of responses may be + anticipated. + + In any case where there may be multiple responses, such as queries + where the answer is a member of a shared resource record set, each + responder SHOULD delay its response by a random amount of time + selected with uniform random distribution in the range 20-120 ms. 
+ The reason for requiring that the delay be at least 20 ms is to + accommodate the situation where two or more query packets are sent + back-to-back, because in that case we want a responder with answers + to more than one of those queries to have the opportunity to + aggregate all of its answers into a single response message. + + In the case where the query has the TC (truncated) bit set, + indicating that subsequent Known-Answer packets will follow, + responders SHOULD delay their responses by a random amount of time + selected with uniform random distribution in the range 400-500 ms, to + allow enough time for all the Known-Answer packets to arrive, as + described in Section 7.2, "Multipacket Known-Answer Suppression". + + The source UDP port in all Multicast DNS responses MUST be 5353 (the + well-known port assigned to mDNS). Multicast DNS implementations + MUST silently ignore any Multicast DNS responses they receive where + the source UDP port is not 5353. + + The destination UDP port in all Multicast DNS responses MUST be 5353, + and the destination address MUST be the mDNS IPv4 link-local + multicast address 224.0.0.251 or its IPv6 equivalent FF02::FB, except + when generating a reply to a query that explicitly requested a + unicast response: + + * via the unicast-response bit, + * by virtue of being a legacy query (Section 6.7), or + * by virtue of being a direct unicast query. + + Except for these three specific cases, responses MUST NOT be sent via + unicast, because then the "Passive Observation of Failures" + mechanisms described in Section 10.5 would not work correctly. Other + + + + +Cheshire & Krochmal Standards Track [Page 15] + +RFC 6762 Multicast DNS February 2013 + + + benefits of sending responses via multicast are discussed in Appendix + D. A Multicast DNS querier MUST only accept unicast responses if + they answer a recently sent query (e.g., sent within the last two + seconds) that explicitly requested unicast responses. 
A Multicast + DNS querier MUST silently ignore all other unicast responses. + + To protect the network against excessive packet flooding due to + software bugs or malicious attack, a Multicast DNS responder MUST NOT + (except in the one special case of answering probe queries) multicast + a record on a given interface until at least one second has elapsed + since the last time that record was multicast on that particular + interface. A legitimate querier on the network should have seen the + previous transmission and cached it. A querier that did not receive + and cache the previous transmission will retry its request and + receive a subsequent response. In the special case of answering + probe queries, because of the limited time before the probing host + will make its decision about whether or not to use the name, a + Multicast DNS responder MUST respond quickly. In this special case + only, when responding via multicast to a probe, a Multicast DNS + responder is only required to delay its transmission as necessary to + ensure an interval of at least 250 ms since the last time the record + was multicast on that interface. + +6.1. Negative Responses + + In the early design of Multicast DNS it was assumed that explicit + negative responses would never be needed. A host can assert the + existence of the set of records that it claims to exist, and the + union of all such sets on a link is the set of Multicast DNS records + that exist on that link. Asserting the nonexistence of every record + in the complement of that set -- i.e., all possible Multicast DNS + records that could exist on this link but do not at this moment -- + was felt to be impractical and unnecessary. The nonexistence of a + record would be ascertained by a querier querying for it and failing + to receive a response from any of the hosts currently attached to the + link. + + However, operational experience showed that explicit negative + responses can sometimes be valuable. 
One such example is when a + querier is querying for a AAAA record, and the host name in question + has no associated IPv6 addresses. In this case, the responding host + knows it currently has exclusive ownership of that name, and it knows + that it currently does not have any IPv6 addresses, so an explicit + negative response is preferable to the querier having to retransmit + its query multiple times, and eventually give up with a timeout, + before it can conclude that a given AAAA record does not exist. + + + + + +Cheshire & Krochmal Standards Track [Page 16] + +RFC 6762 Multicast DNS February 2013 + + + Any time a responder receives a query for a name for which it has + verified exclusive ownership, for a type for which that name has no + records, the responder MUST (except as allowed in (a) below) respond + asserting the nonexistence of that record using a DNS NSEC record + [RFC4034]. In the case of Multicast DNS the NSEC record is not being + used for its usual DNSSEC [RFC4033] security properties, but simply + as a way of expressing which records do or do not exist with a given + name. + + On receipt of a question for a particular name, rrtype, and rrclass, + for which a responder does have one or more unique answers, the + responder MAY also include an NSEC record in the Additional Record + Section indicating the nonexistence of other rrtypes for that name + and rrclass. + + Implementers working with devices with sufficient memory and CPU + resources MAY choose to implement code to handle the full generality + of the DNS NSEC record [RFC4034], including bitmaps up to 65,536 bits + long. To facilitate use by devices with limited memory and CPU + resources, Multicast DNS queriers are only REQUIRED to be able to + parse a restricted form of the DNS NSEC record. 
All compliant + Multicast DNS implementations MUST at least correctly generate and + parse the restricted DNS NSEC record format described below: + + o The 'Next Domain Name' field contains the record's own name. + When used with name compression, this means that the 'Next + Domain Name' field always takes exactly two bytes in the + message. + + o The Type Bit Map block number is 0. + + o The Type Bit Map block length byte is a value in the range 1-32. + + o The Type Bit Map data is 1-32 bytes, as indicated by length + byte. + + Because this restricted form of the DNS NSEC record is limited to + Type Bit Map block number zero, it cannot express the existence of + rrtypes above 255. Consequently, if a Multicast DNS responder were + to have records with rrtypes above 255, it MUST NOT generate these + restricted-form NSEC records for those names, since to do so would + imply that the name has no records with rrtypes above 255, which + would be false. In such cases a Multicast DNS responder MUST either + (a) emit no NSEC record for that name, or (b) emit a full NSEC record + containing the appropriate Type Bit Map block(s) with the correct + bits set for all the record types that exist. In practice this is + not a significant limitation, since rrtypes above 255 are not + currently in widespread use. + + + +Cheshire & Krochmal Standards Track [Page 17] + +RFC 6762 Multicast DNS February 2013 + + + If a Multicast DNS implementation receives an NSEC record where the + 'Next Domain Name' field is not the record's own name, then the + implementation SHOULD ignore the 'Next Domain Name' field and process + the remainder of the NSEC record as usual. In Multicast DNS the + 'Next Domain Name' field is not currently used, but it could be used + in a future version of this protocol, which is why a Multicast DNS + implementation MUST NOT reject or ignore an NSEC record it receives + just because it finds an unexpected value in the 'Next Domain Name' + field. 
+ + If a Multicast DNS implementation receives an NSEC record containing + more than one Type Bit Map, or where the Type Bit Map block number is + not zero, or where the block length is not in the range 1-32, then + the Multicast DNS implementation MAY silently ignore the entire NSEC + record. A Multicast DNS implementation MUST NOT ignore an entire + message just because that message contains one or more NSEC record(s) + that the Multicast DNS implementation cannot parse. This provision + is to allow future enhancements to the protocol to be introduced in a + backwards-compatible way that does not break compatibility with older + Multicast DNS implementations. + + To help differentiate these synthesized NSEC records (generated + programmatically on-the-fly) from conventional Unicast DNS NSEC + records (which actually exist in a signed DNS zone), the synthesized + Multicast DNS NSEC records MUST NOT have the NSEC bit set in the Type + Bit Map, whereas conventional Unicast DNS NSEC records do have the + NSEC bit set. + + The TTL of the NSEC record indicates the intended lifetime of the + negative cache entry. In general, the TTL given for an NSEC record + SHOULD be the same as the TTL that the record would have had, had it + existed. For example, the TTL for address records in Multicast DNS + is typically 120 seconds (see Section 10), so the negative cache + lifetime for an address record that does not exist should also be 120 + seconds. + + A responder MUST only generate negative responses to queries for + which it has legitimate ownership of the name, rrtype, and rrclass in + question, and can legitimately assert that no record with that name, + rrtype, and rrclass exists. 
A responder can assert that a specified + rrtype does not exist for one of its names if it knows a priori that + it has exclusive ownership of that name (e.g., names of reverse + address mapping PTR records, which are derived from IP addresses, + which should be unique on the local link) or if it previously claimed + unique ownership of that name using probe queries for rrtype "ANY". + (If it were to use probe queries for a specific rrtype, then it would + only own the name for that rrtype, and could not assert that other + rrtypes do not exist.) + + + +Cheshire & Krochmal Standards Track [Page 18] + +RFC 6762 Multicast DNS February 2013 + + + The design rationale for this mechanism for encoding negative + responses is discussed further in Appendix E. + +6.2. Responding to Address Queries + + When a Multicast DNS responder sends a Multicast DNS response message + containing its own address records, it MUST include all addresses + that are valid on the interface on which it is sending the message, + and MUST NOT include addresses that are not valid on that interface + (such as addresses that may be configured on the host's other + interfaces). For example, if an interface has both an IPv6 link- + local and an IPv6 routable address, both should be included in the + response message so that queriers receive both and can make their own + choice about which to use. This allows a querier that only has an + IPv6 link-local address to connect to the link-local address, and a + different querier that has an IPv6 routable address to connect to the + IPv6 routable address instead. + + When a Multicast DNS responder places an IPv4 or IPv6 address record + (rrtype "A" or "AAAA") into a response message, it SHOULD also place + any records of the other address type with the same name into the + additional section, if there is space in the message. 
This is to + provide fate sharing, so that all a device's addresses are delivered + atomically in a single message, to reduce the risk that packet loss + could cause a querier to receive only the IPv4 addresses and not the + IPv6 addresses, or vice versa. + + In the event that a device has only IPv4 addresses but no IPv6 + addresses, or vice versa, then the appropriate NSEC record SHOULD be + placed into the additional section, so that queriers can know with + certainty that the device has no addresses of that kind. + + Some Multicast DNS responders treat a physical interface with both + IPv4 and IPv6 addresses as a single interface with two addresses. + Other Multicast DNS responders may treat this case as logically two + interfaces (one with one or more IPv4 addresses, and the other with + one or more IPv6 addresses), but responders that operate this way + MUST NOT put the corresponding automatic NSEC records in replies they + send (i.e., a negative IPv4 assertion in their IPv6 responses, and a + negative IPv6 assertion in their IPv4 responses) because this would + cause incorrect operation in responders on the network that work the + former way. + +6.3. Responding to Multiquestion Queries + + Multicast DNS responders MUST correctly handle DNS query messages + containing more than one question, by answering any or all of the + questions to which they have answers. Unlike single-question + + + +Cheshire & Krochmal Standards Track [Page 19] + +RFC 6762 Multicast DNS February 2013 + + + queries, where responding without delay is allowed in appropriate + cases, for query messages containing more than one question, all + (non-defensive) answers SHOULD be randomly delayed in the range + 20-120 ms, or 400-500 ms if the TC (truncated) bit is set.
This is + because when a query message contains more than one question, a + Multicast DNS responder cannot generally be certain that other + responders will not also be simultaneously generating answers to + other questions in that query message. (Answers defending a name, in + response to a probe for that name, are not subject to this delay rule + and are still sent immediately.) + +6.4. Response Aggregation + + When possible, a responder SHOULD, for the sake of network + efficiency, aggregate as many responses as possible into a single + Multicast DNS response message. For example, when a responder has + several responses it plans to send, each delayed by a different + interval, then earlier responses SHOULD be delayed by up to an + additional 500 ms if that will permit them to be aggregated with + other responses scheduled to go out a little later. + +6.5. Wildcard Queries (qtype "ANY" and qclass "ANY") + + When responding to queries using qtype "ANY" (255) and/or qclass + "ANY" (255), a Multicast DNS responder MUST respond with *ALL* of its + records that match the query. This is subtly different from how + qtype "ANY" and qclass "ANY" work in Unicast DNS. + + A common misconception is that a Unicast DNS query for qtype "ANY" + will elicit a response containing all matching records. This is + incorrect. If there are any records that match the query, the + response is required only to contain at least one of them, not + necessarily all of them. + + This somewhat surprising behavior is commonly seen with caching + (i.e., "recursive") name servers. If a caching server receives a + qtype "ANY" query for which it has at least one valid answer, it is + allowed to return only those matching answers it happens to have + already in its cache, and it is not required to reconsult the + authoritative name server to check if there are any more records that + also match the qtype "ANY" query. 
+ + For example, one might imagine that a query for qtype "ANY" for name + "host.example.com" would return both the IPv4 (A) and the IPv6 (AAAA) + address records for that host. In reality, what happens is that it + depends on the history of what queries have been previously received + by intervening caching servers. If a caching server has no records + for "host.example.com", then it will consult another server (usually + + + +Cheshire & Krochmal Standards Track [Page 20] + +RFC 6762 Multicast DNS February 2013 + + + the authoritative name server for the name in question), and, in that + case, it will typically return all IPv4 and IPv6 address records. + However, if some other host has recently done a query for qtype "A" + for name "host.example.com", so that the caching server already has + IPv4 address records for "host.example.com" in its cache but no IPv6 + address records, then it will return only the IPv4 address records it + already has cached, and no IPv6 address records. + + Multicast DNS does not share this property that qtype "ANY" and + qclass "ANY" queries return some undefined subset of the matching + records. When responding to queries using qtype "ANY" (255) and/or + qclass "ANY" (255), a Multicast DNS responder MUST respond with *ALL* + of its records that match the query. + +6.6. Cooperating Multicast DNS Responders + + If a Multicast DNS responder ("A") observes some other Multicast DNS + responder ("B") send a Multicast DNS response message containing a + resource record with the same name, rrtype, and rrclass as one of A's + resource records, but *different* rdata, then: + + o If A's resource record is intended to be a shared resource + record, then this is no conflict, and no action is required. + + o If A's resource record is intended to be a member of a unique + resource record set owned solely by that responder, then this is + a conflict and MUST be handled as described in Section 9, + "Conflict Resolution". 
+ + If a Multicast DNS responder ("A") observes some other Multicast DNS + responder ("B") send a Multicast DNS response message containing a + resource record with the same name, rrtype, and rrclass as one of A's + resource records, and *identical* rdata, then: + + o If the TTL of B's resource record given in the message is at + least half the true TTL from A's point of view, then no action + is required. + + o If the TTL of B's resource record given in the message is less + than half the true TTL from A's point of view, then A MUST mark + its record to be announced via multicast. Queriers receiving + the record from B would use the TTL given by B and, hence, may + delete the record sooner than A expects. By sending its own + multicast response correcting the TTL, A ensures that the record + will be retained for the desired time. + + + + + + +Cheshire & Krochmal Standards Track [Page 21] + +RFC 6762 Multicast DNS February 2013 + + + These rules allow multiple Multicast DNS responders to offer the same + data on the network (perhaps for fault-tolerance reasons) without + conflicting with each other. + +6.7. Legacy Unicast Responses + + If the source UDP port in a received Multicast DNS query is not port + 5353, this indicates that the querier originating the query is a + simple resolver such as described in Section 5.1, "One-Shot Multicast + DNS Queries", which does not fully implement all of Multicast DNS. + In this case, the Multicast DNS responder MUST send a UDP response + directly back to the querier, via unicast, to the query packet's + source IP address and port. This unicast response MUST be a + conventional unicast response as would be generated by a conventional + Unicast DNS server; for example, it MUST repeat the query ID and the + question given in the query message. In addition, the cache-flush + bit described in Section 10.2, "Announcements to Flush Outdated Cache + Entries", MUST NOT be set in legacy unicast responses. 
+ + The resource record TTL given in a legacy unicast response SHOULD NOT + be greater than ten seconds, even if the true TTL of the Multicast + DNS resource record is higher. This is because Multicast DNS + responders that fully participate in the protocol use the cache + coherency mechanisms described in Section 10, "Resource Record TTL + Values and Cache Coherency", to update and invalidate stale data. + Were unicast responses sent to legacy resolvers to use the same high + TTLs, these legacy resolvers, which do not implement these cache + coherency mechanisms, could retain stale cached resource record data + long after it is no longer valid. + +7. Traffic Reduction + + A variety of techniques are used to reduce the amount of traffic on + the network. + +7.1. Known-Answer Suppression + + When a Multicast DNS querier sends a query to which it already knows + some answers, it populates the Answer Section of the DNS query + message with those answers. + + Generally, this applies only to Shared records, not Unique records, + since if a Multicast DNS querier already has at least one Unique + record in its cache then it should not be expecting further different + answers to this question, since the Unique record(s) it already has + comprise the complete answer, so it has no reason to be sending the + query at all. In contrast, having some Shared records in its cache + does not necessarily imply that a Multicast DNS querier will not + + + +Cheshire & Krochmal Standards Track [Page 22] + +RFC 6762 Multicast DNS February 2013 + + + receive further answers to this query, and it is in this case that it + is beneficial to use the Known-Answer list to suppress repeated + sending of redundant answers that the querier already knows. + + A Multicast DNS responder MUST NOT answer a Multicast DNS query if + the answer it would give is already included in the Answer Section + with an RR TTL at least half the correct value. 
If the RR TTL of the + answer as given in the Answer Section is less than half of the true + RR TTL as known by the Multicast DNS responder, the responder MUST + send an answer so as to update the querier's cache before the record + becomes in danger of expiration. + + Because a Multicast DNS responder will respond if the remaining TTL + given in the Known-Answer list is less than half the true TTL, it is + superfluous for the querier to include such records in the Known- + Answer list. Therefore, a Multicast DNS querier SHOULD NOT include + records in the Known-Answer list whose remaining TTL is less than + half of their original TTL. Doing so would simply consume space in + the message without achieving the goal of suppressing responses and + would, therefore, be a pointless waste of network capacity. + + A Multicast DNS querier MUST NOT cache resource records observed in + the Known-Answer Section of other Multicast DNS queries. The Answer + Section of Multicast DNS queries is not authoritative. By placing + information in the Answer Section of a Multicast DNS query, the + querier is stating that it *believes* the information to be true. It + is not asserting that the information *is* true. Some of those + records may have come from other hosts that are no longer on the + network. Propagating that stale information to other Multicast DNS + queriers on the network would not be helpful. + +7.2. Multipacket Known-Answer Suppression + + Sometimes a Multicast DNS querier will already have too many answers + to fit in the Known-Answer Section of its query packets. In this + case, it should issue a Multicast DNS query containing a question and + as many Known-Answer records as will fit. It MUST then set the TC + (Truncated) bit in the header before sending the query. It MUST + immediately follow the packet with another query packet containing no + questions and as many more Known-Answer records as will fit. 
If + there are still too many records remaining to fit in the packet, it + again sets the TC bit and continues until all the Known-Answer + records have been sent. + + A Multicast DNS responder seeing a Multicast DNS query with the TC + bit set defers its response for a time period randomly selected in + the interval 400-500 ms. This gives the Multicast DNS querier time + to send additional Known-Answer packets before the responder + + + +Cheshire & Krochmal Standards Track [Page 23] + +RFC 6762 Multicast DNS February 2013 + + + responds. If the responder sees any of its answers listed in the + Known-Answer lists of subsequent packets from the querying host, it + MUST delete that answer from the list of answers it is planning to + give (provided that no other host on the network has also issued a + query for that record and is waiting to receive an answer). + + If the responder receives additional Known-Answer packets with the TC + bit set, it SHOULD extend the delay as necessary to ensure a pause of + 400-500 ms after the last such packet before it sends its answer. + This opens the potential risk that a continuous stream of Known- + Answer packets could, theoretically, prevent a responder from + answering indefinitely. In practice, answers are never actually + delayed significantly, and should a situation arise where significant + delays did happen, that would be a scenario where the network is so + overloaded that it would be desirable to err on the side of caution. + The consequence of delaying an answer may be that it takes a user + longer than usual to discover all the services on the local network; + in contrast, the consequence of incorrectly answering before all the + Known-Answer packets have been received would be wasted capacity + sending unnecessary answers on an already overloaded network. In + this (rare) situation, sacrificing speed to preserve reliable network + operation is the right trade-off. + +7.3. 
Duplicate Question Suppression + + If a host is planning to transmit (or retransmit) a query, and it + sees another host on the network send a query containing the same + "QM" question, and the Known-Answer Section of that query does not + contain any records that this host would not also put in its own + Known-Answer Section, then this host SHOULD treat its own query as + having been sent. When multiple queriers on the network are querying + for the same resource records, there is no need for them to all be + repeatedly asking the same question. + +7.4. Duplicate Answer Suppression + + If a host is planning to send an answer, and it sees another host on + the network send a response message containing the same answer + record, and the TTL in that record is not less than the TTL this host + would have given, then this host SHOULD treat its own answer as + having been sent, and not also send an identical answer itself. When + multiple responders on the network have the same data, there is no + need for all of them to respond. + + + + + + + + +Cheshire & Krochmal Standards Track [Page 24] + +RFC 6762 Multicast DNS February 2013 + + + The opportunity for duplicate answer suppression occurs when a host + has received a query, and is delaying its response for some pseudo- + random interval up to 500 ms, as described elsewhere in this + document, and then, before the host sends its response, it sees some + other host on the network send a response message containing the same + answer record. + + This feature is particularly useful when Multicast DNS Proxy Servers + are in use, where there could be more than one proxy on the network + giving Multicast DNS answers on behalf of some other host (e.g., + because that other host is currently asleep and is not itself + responding to queries). + +8. Probing and Announcing on Startup + + Typically a Multicast DNS responder should have, at the very least, + address records for all of its active interfaces. 
Creating and + advertising an HINFO record on each interface as well can be useful + to network administrators. + + Whenever a Multicast DNS responder starts up, wakes up from sleep, + receives an indication of a network interface "Link Change" event, or + has any other reason to believe that its network connectivity may + have changed in some relevant way, it MUST perform the two startup + steps below: Probing (Section 8.1) and Announcing (Section 8.3). + +8.1. Probing + + The first startup step is that, for all those resource records that a + Multicast DNS responder desires to be unique on the local link, it + MUST send a Multicast DNS query asking for those resource records, to + see if any of them are already in use. The primary example of this + is a host's address records, which map its unique host name to its + unique IPv4 and/or IPv6 addresses. All probe queries SHOULD be done + using the desired resource record name and class (usually class 1, + "Internet"), and query type "ANY" (255), to elicit answers for all + types of records with that name. This allows a single question to be + used in place of several questions, which is more efficient on the + network. It also allows a host to verify exclusive ownership of a + name for all rrtypes, which is desirable in most cases. It would be + confusing, for example, if one host owned the "A" record for + "myhost.local.", but a different host owned the "AAAA" record for + that name. + + + + + + + + +Cheshire & Krochmal Standards Track [Page 25] + +RFC 6762 Multicast DNS February 2013 + + + The ability to place more than one question in a Multicast DNS query + is useful here, because it can allow a host to use a single message + to probe for all of its resource records instead of needing a + separate message for each. For example, a host can simultaneously + probe for uniqueness of its "A" record and all its SRV records + [RFC6763] in the same query message. 
+ + When ready to send its Multicast DNS probe packet(s) the host should + first wait for a short random delay time, uniformly distributed in + the range 0-250 ms. This random delay is to guard against the case + where several devices are powered on simultaneously, or several + devices are connected to an Ethernet hub, which is then powered on, + or some other external event happens that might cause a group of + hosts to all send synchronized probes. + + 250 ms after the first query, the host should send a second; then, + 250 ms after that, a third. If, by 250 ms after the third probe, no + conflicting Multicast DNS responses have been received, the host may + move to the next step, announcing. (Note that probing is the one + exception from the normal rule that there should be at least one + second between repetitions of the same question, and the interval + between subsequent repetitions should at least double.) + + When sending probe queries, a host MUST NOT consult its cache for + potential answers. Only conflicting Multicast DNS responses received + "live" from the network are considered valid for the purposes of + determining whether probing has succeeded or failed. + + In order to allow services to announce their presence without + unreasonable delay, the time window for probing is intentionally set + quite short. As a result of this, from the time the first probe + packet is sent, another device on the network using that name has + just 750 ms to respond to defend its name. On networks that are + slow, or busy, or both, it is possible for round-trip latency to + account for a few hundred milliseconds, and software delays in slow + devices can add additional delay. Hence, it is important that when a + device receives a probe query for a name that it is currently using, + it SHOULD generate its response to defend that name immediately and + send it as quickly as possible. 
The usual rules about random delays + before responding, to avoid sudden bursts of simultaneous answers + from different hosts, do not apply here since normally at most one + host should ever respond to a given probe question. Even when a + single DNS query message contains multiple probe questions, it would + be unusual for that message to elicit a defensive response from more + than one other host. Because of the mDNS multicast rate-limiting + + + + + + +Cheshire & Krochmal Standards Track [Page 26] + +RFC 6762 Multicast DNS February 2013 + + + rules, the probes SHOULD be sent as "QU" questions with the unicast- + response bit set, to allow a defending host to respond immediately + via unicast, instead of potentially having to wait before replying + via multicast. + + During probing, from the time the first probe packet is sent until + 250 ms after the third probe, if any conflicting Multicast DNS + response is received, then the probing host MUST defer to the + existing host, and SHOULD choose new names for some or all of its + resource records as appropriate. Apparently conflicting Multicast + DNS responses received *before* the first probe packet is sent MUST + be silently ignored (see discussion of stale probe packets in Section + 8.2, "Simultaneous Probe Tiebreaking", below). In the case of a host + probing using query type "ANY" as recommended above, any answer + containing a record with that name, of any type, MUST be considered a + conflicting response and handled accordingly. + + If fifteen conflicts occur within any ten-second period, then the + host MUST wait at least five seconds before each successive + additional probe attempt. This is to help ensure that, in the event + of software bugs or other unanticipated problems, errant hosts do not + flood the network with a continuous stream of multicast traffic. 
For + very simple devices, a valid way to comply with this requirement is + to always wait five seconds after any failed probe attempt before + trying again. + + If a responder knows by other means that its unique resource record + set name, rrtype, and rrclass cannot already be in use by any other + responder on the network, then it SHOULD skip the probing step for + that resource record set. For example, when creating the reverse + address mapping PTR records, the host can reasonably assume that no + other host will be trying to create those same PTR records, since + that would imply that the two hosts were trying to use the same IP + address, and if that were the case, the two hosts would be suffering + communication problems beyond the scope of what Multicast DNS is + designed to solve. Similarly, if a responder is acting as a proxy, + taking over from another Multicast DNS responder that has already + verified the uniqueness of the record, then the proxy SHOULD NOT + repeat the probing step for those records. + +8.2. Simultaneous Probe Tiebreaking + + The astute reader will observe that there is a race condition + inherent in the previous description. If two hosts are probing for + the same name simultaneously, neither will receive any response to + the probe, and the hosts could incorrectly conclude that they may + both proceed to use the name. To break this symmetry, each host + populates the query message's Authority Section with the record or + + + +Cheshire & Krochmal Standards Track [Page 27] + +RFC 6762 Multicast DNS February 2013 + + + records with the rdata that it would be proposing to use, should its + probing be successful. The Authority Section is being used here in a + way analogous to the way it is used as the "Update Section" in a DNS + Update message [RFC2136] [RFC3007]. 
+ + When a host is probing for a group of related records with the same + name (e.g., the SRV and TXT record describing a DNS-SD service), only + a single question need be placed in the Question Section, since query + type "ANY" (255) is used, which will elicit answers for all records + with that name. However, for tiebreaking to work correctly in all + cases, the Authority Section must contain *all* the records and + proposed rdata being probed for uniqueness. + + When a host that is probing for a record sees another host issue a + query for the same record, it consults the Authority Section of that + query. If it finds any resource record(s) there which answers the + query, then it compares the data of that (those) resource record(s) + with its own tentative data. We consider first the simple case of a + host probing for a single record, receiving a simultaneous probe from + another host also probing for a single record. The two records are + compared and the lexicographically later data wins. This means that + if the host finds that its own data is lexicographically later, it + simply ignores the other host's probe. If the host finds that its + own data is lexicographically earlier, then it defers to the winning + host by waiting one second, and then begins probing for this record + again. The logic for waiting one second and then trying again is to + guard against stale probe packets on the network (possibly even stale + probe packets sent moments ago by this host itself, before some + configuration change, which may be echoed back after a short delay by + some Ethernet switches and some 802.11 base stations). If the + winning simultaneous probe was from a real other host on the network, + then after one second it will have completed its probing, and will + answer subsequent probes. 
If the apparently winning simultaneous + probe was in fact just an old stale packet on the network (maybe from + the host itself), then when it retries its probing in one second, its + probes will go unanswered, and it will successfully claim the name. + + The determination of "lexicographically later" is performed by first + comparing the record class (excluding the cache-flush bit described + in Section 10.2), then the record type, then raw comparison of the + binary content of the rdata without regard for meaning or structure. + If the record classes differ, then the numerically greater class is + considered "lexicographically later". Otherwise, if the record types + differ, then the numerically greater type is considered + "lexicographically later". If the rrtype and rrclass both match, + then the rdata is compared. + + + + + +Cheshire & Krochmal Standards Track [Page 28] + +RFC 6762 Multicast DNS February 2013 + + + In the case of resource records containing rdata that is subject to + name compression [RFC1035], the names MUST be uncompressed before + comparison. (The details of how a particular name is compressed is + an artifact of how and where the record is written into the DNS + message; it is not an intrinsic property of the resource record + itself.) + + The bytes of the raw uncompressed rdata are compared in turn, + interpreting the bytes as eight-bit UNSIGNED values, until a byte is + found whose value is greater than that of its counterpart (in which + case, the rdata whose byte has the greater value is deemed + lexicographically later) or one of the resource records runs out of + rdata (in which case, the resource record which still has remaining + data first is deemed lexicographically later). The following is an + example of a conflict: + + MyPrinter.local. A 169.254.99.200 + MyPrinter.local. 
A 169.254.200.50 + + In this case, 169.254.200.50 is lexicographically later (the third + byte, with value 200, is greater than its counterpart with value 99), + so it is deemed the winner. + + Note that it is vital that the bytes are interpreted as UNSIGNED + values in the range 0-255, or the wrong outcome may result. In the + example above, if the byte with value 200 had been incorrectly + interpreted as a signed eight-bit value, then it would be interpreted + as value -56, and the wrong address record would be deemed the + winner. + +8.2.1. Simultaneous Probe Tiebreaking for Multiple Records + + When a host is probing for a set of records with the same name, or a + message is received containing multiple tiebreaker records answering + a given probe question in the Question Section, the host's records + and the tiebreaker records from the message are each sorted into + order, and then compared pairwise, using the same comparison + technique described above, until a difference is found. + + The records are sorted using the same lexicographical order as + described above, that is, if the record classes differ, the record + with the lower class number comes first. If the classes are the same + but the rrtypes differ, the record with the lower rrtype number comes + first. If the class and rrtype match, then the rdata is compared + bytewise until a difference is found. For example, in the common + case of advertising DNS-SD services with a TXT record and an SRV + record, the TXT record comes first (the rrtype value for TXT is 16) + and the SRV record comes second (the rrtype value for SRV is 33). + + + +Cheshire & Krochmal Standards Track [Page 29] + +RFC 6762 Multicast DNS February 2013 + + + When comparing the records, if the first records match perfectly, + then the second records are compared, and so on. If either list of + records runs out of records before any difference is found, then the + list with records remaining is deemed to have won the tiebreak. 
If + both lists run out of records at the same time without any difference + being found, then this indicates that two devices are advertising + identical sets of records, as is sometimes done for fault tolerance, + and there is, in fact, no conflict. + +8.3. Announcing + + The second startup step is that the Multicast DNS responder MUST send + an unsolicited Multicast DNS response containing, in the Answer + Section, all of its newly registered resource records (both shared + records, and unique records that have completed the probing step). + If there are too many resource records to fit in a single packet, + multiple packets should be used. + + In the case of shared records (e.g., the PTR records used by DNS- + Based Service Discovery [RFC6763]), the records are simply placed as + is into the Answer Section of the DNS response. + + In the case of records that have been verified to be unique in the + previous step, they are placed into the Answer Section of the DNS + response with the most significant bit of the rrclass set to one. + The most significant bit of the rrclass for a record in the Answer + Section of a response message is the Multicast DNS cache-flush bit + and is discussed in more detail below in Section 10.2, "Announcements + to Flush Outdated Cache Entries". + + The Multicast DNS responder MUST send at least two unsolicited + responses, one second apart. To provide increased robustness against + packet loss, a responder MAY send up to eight unsolicited responses, + provided that the interval between unsolicited responses increases by + at least a factor of two with every response sent. + + A Multicast DNS responder MUST NOT send announcements in the absence + of information that its network connectivity may have changed in some + relevant way. In particular, a Multicast DNS responder MUST NOT send + regular periodic announcements as a matter of course. 
+ + Whenever a Multicast DNS responder receives any Multicast DNS + response (solicited or otherwise) containing a conflicting resource + record, the conflict MUST be resolved as described in Section 9, + "Conflict Resolution". + + + + + + +Cheshire & Krochmal Standards Track [Page 30] + +RFC 6762 Multicast DNS February 2013 + + +8.4. Updating + + At any time, if the rdata of any of a host's Multicast DNS records + changes, the host MUST repeat the Announcing step described above to + update neighboring caches. For example, if any of a host's IP + addresses change, it MUST re-announce those address records. The + host does not need to repeat the Probing step because it has already + established unique ownership of that name. + + In the case of shared records, a host MUST send a "goodbye" + announcement with RR TTL zero (see Section 10.1, "Goodbye Packets") + for the old rdata, to cause it to be deleted from peer caches, before + announcing the new rdata. In the case of unique records, a host + SHOULD omit the "goodbye" announcement, since the cache-flush bit on + the newly announced records will cause old rdata to be flushed from + peer caches anyway. + + A host may update the contents of any of its records at any time, + though a host SHOULD NOT update records more frequently than ten + times per minute. Frequent rapid updates impose a burden on the + network. If a host has information to disseminate which changes more + frequently than ten times per minute, then it may be more appropriate + to design a protocol for that specific purpose. + +9. Conflict Resolution + + A conflict occurs when a Multicast DNS responder has a unique record + for which it is currently authoritative, and it receives a Multicast + DNS response message containing a record with the same name, rrtype + and rrclass, but inconsistent rdata. 
What may be considered + inconsistent is context sensitive, except that resource records with + identical rdata are never considered inconsistent, even if they + originate from different hosts. This is to permit use of proxies and + other fault-tolerance mechanisms that may cause more than one + responder to be capable of issuing identical answers on the network. + + A common example of a resource record type that is intended to be + unique, not shared between hosts, is the address record that maps a + host's name to its IP address. Should a host witness another host + announce an address record with the same name but a different IP + address, then that is considered inconsistent, and that address + record is considered to be in conflict. + + Whenever a Multicast DNS responder receives any Multicast DNS + response (solicited or otherwise) containing a conflicting resource + record in any of the Resource Record Sections, the Multicast DNS + responder MUST immediately reset its conflicted unique record to + probing state, and go through the startup steps described above in + + + +Cheshire & Krochmal Standards Track [Page 31] + +RFC 6762 Multicast DNS February 2013 + + + Section 8, "Probing and Announcing on Startup". The protocol used in + the Probing phase will determine a winner and a loser, and the loser + MUST cease using the name, and reconfigure. + + It is very important that any host receiving a resource record that + conflicts with one of its own MUST take action as described above. + In the case of two hosts using the same host name, where one has been + configured to require a unique host name and the other has not, the + one that has not been configured to require a unique host name will + not perceive any conflict, and will not take any action. By + reverting to Probing state, the host that desires a unique host name + will go through the necessary steps to ensure that a unique host name + is obtained. 
+ + The recommended course of action after probing and failing is as + follows: + + 1. Programmatically change the resource record name in an attempt + to find a new name that is unique. This could be done by + adding some further identifying information (e.g., the model + name of the hardware) if it is not already present in the name, + or appending the digit "2" to the name, or incrementing a + number at the end of the name if one is already present. + + 2. Probe again, and repeat as necessary until a unique name is + found. + + 3. Once an available unique name has been determined, by probing + without receiving any conflicting response, record this newly + chosen name in persistent storage so that the device will use + the same name the next time it is power-cycled. + + 4. Display a message to the user or operator informing them of the + name change. For example: + + The name "Bob's Music" is in use by another music server on + the network. Your music collection has been renamed to + "Bob's Music (2)". If you want to change this name, use + [describe appropriate menu item or preference dialog here]. + + The details of how the user or operator is informed of the new + name depends on context. A desktop computer with a screen + might put up a dialog box. A headless server in the closet may + write a message to a log file, or use whatever mechanism + (email, SNMP trap, etc.) it uses to inform the administrator of + error conditions. On the other hand, a headless server in the + closet may not inform the user at all -- if the user cares, + + + + +Cheshire & Krochmal Standards Track [Page 32] + +RFC 6762 Multicast DNS February 2013 + + + they will notice the name has changed, and connect to the + server in the usual way (e.g., via web browser) to configure a + new name. + + 5. After one minute of probing, if the Multicast DNS responder has + been unable to find any unused name, it should log an error + message to inform the user or operator of this fact. 
This + situation should never occur in normal operation. The only + situations that would cause this to happen would be either a + deliberate denial-of-service attack, or some kind of very + obscure hardware or software bug that acts like a deliberate + denial-of-service attack. + + These considerations apply to address records (i.e., host names) and + to all resource records where uniqueness (or maintenance of some + other defined constraint) is desired. + +10. Resource Record TTL Values and Cache Coherency + + As a general rule, the recommended TTL value for Multicast DNS + resource records with a host name as the resource record's name + (e.g., A, AAAA, HINFO) or a host name contained within the resource + record's rdata (e.g., SRV, reverse mapping PTR record) SHOULD be 120 + seconds. + + The recommended TTL value for other Multicast DNS resource records is + 75 minutes. + + A querier with an active outstanding query will issue a query message + when one or more of the resource records in its cache are 80% of the + way to expiry. If the TTL on those records is 75 minutes, this + ongoing cache maintenance process yields a steady-state query rate of + one query every 60 minutes. + + Any distributed cache needs a cache coherency protocol. If Multicast + DNS resource records follow the recommendation and have a TTL of 75 + minutes, that means that stale data could persist in the system for a + little over an hour. Making the default RR TTL significantly lower + would reduce the lifetime of stale data, but would produce too much + extra traffic on the network. Various techniques are available to + minimize the impact of such stale data, outlined in the five + subsections below. + +10.1. 
Goodbye Packets + + In the case where a host knows that certain resource record data is + about to become invalid (for example, when the host is undergoing a + clean shutdown), the host SHOULD send an unsolicited Multicast DNS + + + +Cheshire & Krochmal Standards Track [Page 33] + +RFC 6762 Multicast DNS February 2013 + + + response packet, giving the same resource record name, rrtype, + rrclass, and rdata, but an RR TTL of zero. This has the effect of + updating the TTL stored in neighboring hosts' cache entries to zero, + causing that cache entry to be promptly deleted. + + Queriers receiving a Multicast DNS response with a TTL of zero SHOULD + NOT immediately delete the record from the cache, but instead record + a TTL of 1 and then delete the record one second later. In the case + of multiple Multicast DNS responders on the network described in + Section 6.6 above, if one of the responders shuts down and + incorrectly sends goodbye packets for its records, it gives the other + cooperating responders one second to send out their own response to + "rescue" the records before they expire and are deleted. + +10.2. Announcements to Flush Outdated Cache Entries + + Whenever a host has a resource record with new data, or with what + might potentially be new data (e.g., after rebooting, waking from + sleep, connecting to a new network link, or changing IP address), the + host needs to inform peers of that new data. In cases where the host + has not been continuously connected and participating on the network + link, it MUST first probe to re-verify uniqueness of its unique + records, as described above in Section 8.1, "Probing". + + Having completed the Probing step, if necessary, the host MUST then + send a series of unsolicited announcements to update cache entries in + its neighbor hosts. In these unsolicited announcements, if the + record is one that has been verified unique, the host sets the most + significant bit of the rrclass field of the resource record. 
This + bit, the cache-flush bit, tells neighboring hosts that this is not a + shared record type. Instead of merging this new record additively + into the cache in addition to any previous records with the same + name, rrtype, and rrclass, all old records with that name, rrtype, + and rrclass that were received more than one second ago are declared + invalid, and marked to expire from the cache in one second. + + The semantics of the cache-flush bit are as follows: normally when a + resource record appears in a Resource Record Section of the DNS + response it means, "This is an assertion that this information is + true". When a resource record appears in a Resource Record Section + of the DNS response with the cache-flush bit set, it means, "This is + an assertion that this information is the truth and the whole truth, + and anything you may have heard more than a second ago regarding + records of this name/rrtype/rrclass is no longer true". + + To accommodate the case where the set of records from one host + constituting a single unique RRSet is too large to fit in a single + packet, only cache records that are more than one second old are + + + +Cheshire & Krochmal Standards Track [Page 34] + +RFC 6762 Multicast DNS February 2013 + + + flushed. This allows the announcing host to generate a quick burst + of packets back-to-back on the wire containing all the members of the + RRSet. When receiving records with the cache-flush bit set, all + records older than one second are marked to be deleted one second in + the future. One second after the end of the little packet burst, any + records not represented within that packet burst will then be expired + from all peer caches. + + Any time a host sends a response packet containing some members of a + unique RRSet, it MUST send the entire RRSet, preferably in a single + packet, or if the entire RRSet will not fit in a single packet, in a + quick burst of packets sent as close together as possible. 
The host + MUST set the cache-flush bit on all members of the unique RRSet. + + Another reason for waiting one second before deleting stale records + from the cache is to accommodate bridged networks. For example, a + host's address record announcement on a wireless interface may be + bridged onto a wired Ethernet and may cause that same host's Ethernet + address records to be flushed from peer caches. The one-second delay + gives the host the chance to see its own announcement arrive on the + wired Ethernet, and immediately re-announce its Ethernet interface's + address records so that both sets remain valid and live in peer + caches. + + These rules, about when to set the cache-flush bit and about sending + the entire rrset, apply regardless of *why* the response message is + being generated. They apply to startup announcements as described in + Section 8.3, "Announcing", and to responses generated as a result of + receiving query messages. + + The cache-flush bit is only set in records in the Resource Record + Sections of Multicast DNS responses sent to UDP port 5353. + + The cache-flush bit MUST NOT be set in any resource records in a + response message sent in legacy unicast responses to UDP ports other + than 5353. + + The cache-flush bit MUST NOT be set in any resource records in the + Known-Answer list of any query message. + + The cache-flush bit MUST NOT ever be set in any shared resource + record. To do so would cause all the other shared versions of this + resource record with different rdata from different responders to be + immediately deleted from all the caches on the network. + + + + + + + +Cheshire & Krochmal Standards Track [Page 35] + +RFC 6762 Multicast DNS February 2013 + + + The cache-flush bit does *not* apply to questions listed in the + Question Section of a Multicast DNS message. The top bit of the + rrclass field in questions is used for an entirely different purpose + (see Section 5.4, "Questions Requesting Unicast Responses"). 
+ + Note that the cache-flush bit is NOT part of the resource record + class. The cache-flush bit is the most significant bit of the second + 16-bit word of a resource record in a Resource Record Section of a + Multicast DNS message (the field conventionally referred to as the + rrclass field), and the actual resource record class is the least + significant fifteen bits of this field. There is no Multicast DNS + resource record class 0x8001. The value 0x8001 in the rrclass field + of a resource record in a Multicast DNS response message indicates a + resource record with class 1, with the cache-flush bit set. When + receiving a resource record with the cache-flush bit set, + implementations should take care to mask off that bit before storing + the resource record in memory, or otherwise ensure that it is given + the correct semantic interpretation. + + The reuse of the top bit of the rrclass field only applies to + conventional resource record types that are subject to caching, not + to pseudo-RRs like OPT [RFC2671], TSIG [RFC2845], TKEY [RFC2930], + SIG0 [RFC2931], etc., that pertain only to a particular transport + level message and not to any actual DNS data. Since pseudo-RRs + should never go into the Multicast DNS cache, the concept of a cache- + flush bit for these types is not applicable. In particular, the + rrclass field of an OPT record encodes the sender's UDP payload size, + and should be interpreted as a sixteen-bit length value in the range + 0-65535, not a one-bit flag and a fifteen-bit length. + +10.3. Cache Flush on Topology change + + If the hardware on a given host is able to indicate physical changes + of connectivity, then when the hardware indicates such a change, the + host should take this information into account in its Multicast DNS + cache management strategy. For example, a host may choose to + immediately flush all cache records received on a particular + interface when that cable is disconnected. 
Alternatively, a host may + choose to adjust the remaining TTL on all those records to a few + seconds so that if the cable is not reconnected quickly, those + records will expire from the cache. + + Likewise, when a host reboots, wakes from sleep, or undergoes some + other similar discontinuous state change, the cache management + strategy should take that information into account. + + + + + + +Cheshire & Krochmal Standards Track [Page 36] + +RFC 6762 Multicast DNS February 2013 + + +10.4. Cache Flush on Failure Indication + + Sometimes a cache record can be determined to be stale when a client + attempts to use the rdata it contains, and the client finds that + rdata to be incorrect. + + For example, the rdata in an address record can be determined to be + incorrect if attempts to contact that host fail, either because (for + an IPv4 address on a local subnet) ARP requests for that address go + unanswered, because (for an IPv6 address with an on-link prefix) ND + requests for that address go unanswered, or because (for an address + on a remote network) a router returns an ICMP "Host Unreachable" + error. + + The rdata in an SRV record can be determined to be incorrect if + attempts to communicate with the indicated service at the host and + port number indicated are not successful. + + The rdata in a DNS-SD PTR record can be determined to be incorrect if + attempts to look up the SRV record it references are not successful. + + The software implementing the Multicast DNS resource record cache + should provide a mechanism so that clients detecting stale rdata can + inform the cache. + + When the cache receives this hint that it should reconfirm some + record, it MUST issue two or more queries for the resource record in + dispute. If no response is received within ten seconds, then, even + though its TTL may indicate that it is not yet due to expire, that + record SHOULD be promptly flushed from the cache. 
+ + The end result of this is that if a printer suffers a sudden power + failure or other abrupt disconnection from the network, its name may + continue to appear in DNS-SD browser lists displayed on users' + screens. Eventually, that entry will expire from the cache + naturally, but if a user tries to access the printer before that + happens, the failure to successfully contact the printer will trigger + the more hasty demise of its cache entries. This is a sensible + trade-off between good user experience and good network efficiency. + If we were to insist that printers should disappear from the printer + list within 30 seconds of becoming unavailable, for all failure + modes, the only way to achieve this would be for the client to poll + the printer at least every 30 seconds, or for the printer to announce + its presence at least every 30 seconds, both of which would be an + unreasonable burden on most networks. + + + + + + +Cheshire & Krochmal Standards Track [Page 37] + +RFC 6762 Multicast DNS February 2013 + + +10.5. Passive Observation Of Failures (POOF) + + A host observes the multicast queries issued by the other hosts on + the network. One of the major benefits of also sending responses + using multicast is that it allows all hosts to see the responses (or + lack thereof) to those queries. + + If a host sees queries, for which a record in its cache would be + expected to be given as an answer in a multicast response, but no + such answer is seen, then the host may take this as an indication + that the record may no longer be valid. + + After seeing two or more of these queries, and seeing no multicast + response containing the expected answer within ten seconds, then even + though its TTL may indicate that it is not yet due to expire, that + record SHOULD be flushed from the cache. The host SHOULD NOT perform + its own queries to reconfirm that the record is truly gone. 
If every + host on a large network were to do this, it would cause a lot of + unnecessary multicast traffic. If host A sends multicast queries + that remain unanswered, then there is no reason to suppose that host + B or any other host is likely to be any more successful. + + The previous section, "Cache Flush on Failure Indication", describes + a situation where a user trying to print discovers that the printer + is no longer available. By implementing the passive observation + described here, when one user fails to contact the printer, all hosts + on the network observe that failure and update their caches + accordingly. + +11. Source Address Check + + All Multicast DNS responses (including responses sent via unicast) + SHOULD be sent with IP TTL set to 255. This is recommended to + provide backwards-compatibility with older Multicast DNS queriers + (implementing a draft version of this document, posted in February + 2004) that check the IP TTL on reception to determine whether the + packet originated on the local link. These older queriers discard + all packets with TTLs other than 255. + + A host sending Multicast DNS queries to a link-local destination + address (including the 224.0.0.251 and FF02::FB link-local multicast + addresses) MUST only accept responses to that query that originate + from the local link, and silently discard any other response packets. + Without this check, it could be possible for remote rogue hosts to + send spoof answer packets (perhaps unicast to the victim host), which + the receiving machine could misinterpret as having originated on the + local link. 
+ + + + +Cheshire & Krochmal Standards Track [Page 38] + +RFC 6762 Multicast DNS February 2013 + + + The test for whether a response originated on the local link is done + in two ways: + + * All responses received with a destination address in the IP + header that is the mDNS IPv4 link-local multicast address + 224.0.0.251 or the mDNS IPv6 link-local multicast address + FF02::FB are necessarily deemed to have originated on the local + link, regardless of source IP address. This is essential to + allow devices to work correctly and reliably in unusual + configurations, such as multiple logical IP subnets overlaid on + a single link, or in cases of severe misconfiguration, where + devices are physically connected to the same link, but are + currently misconfigured with completely unrelated IP addresses + and subnet masks. + + * For responses received with a unicast destination address in the + IP header, the source IP address in the packet is checked to see + if it is an address on a local subnet. An IPv4 source address + is determined to be on a local subnet if, for (one of) the + address(es) configured on the interface receiving the packet, (I + & M) == (P & M), where I and M are the interface address and + subnet mask respectively, P is the source IP address from the + packet, '&' represents the bitwise logical 'and' operation, and + '==' represents a bitwise equality test. An IPv6 source address + is determined to be on the local link if, for any of the on-link + IPv6 prefixes on the interface receiving the packet (learned via + IPv6 router advertisements or otherwise configured on the host), + the first 'n' bits of the IPv6 source address match the first + 'n' bits of the prefix address, where 'n' is the length of the + prefix being considered. + + Since queriers will ignore responses apparently originating outside + the local subnet, a responder SHOULD avoid generating responses that + it can reasonably predict will be ignored.
This applies particularly + in the case of overlaid subnets. If a responder receives a query + addressed to the mDNS IPv4 link-local multicast address 224.0.0.251, + from a source address not apparently on the same subnet as the + responder (or, in the case of IPv6, from a source IPv6 address for + which the responder does not have any address with the same prefix on + that interface), then even if the query indicates that a unicast + response is preferred (see Section 5.4, "Questions Requesting Unicast + Responses"), the responder SHOULD elect to respond by multicast + anyway, since it can reasonably predict that a unicast response with + an apparently non-local source address will probably be ignored. + + + + + + + +Cheshire & Krochmal Standards Track [Page 39] + +RFC 6762 Multicast DNS February 2013 + + +12. Special Characteristics of Multicast DNS Domains + + Unlike conventional DNS names, names that end in ".local." have only + local significance. The same is true of names within the IPv4 link- + local reverse mapping domain "254.169.in-addr.arpa." and the IPv6 + link-local reverse mapping domains "8.e.f.ip6.arpa.", + "9.e.f.ip6.arpa.", "a.e.f.ip6.arpa.", and "b.e.f.ip6.arpa.". + + These names function primarily as protocol identifiers, rather than + as user-visible identifiers. Even though they may occasionally be + visible to end users, that is not their primary purpose. As such, + these names should be treated as opaque identifiers. In particular, + the string "local" should not be translated or localized into + different languages, much as the name "localhost" is not translated + or localized into different languages. + + Conventional Unicast DNS seeks to provide a single unified namespace, + where a given DNS query yields the same answer no matter where on the + planet it is performed or to which recursive DNS server the query is + sent. In contrast, each IP link has its own private ".local.", + "254.169.in-addr.arpa."
and IPv6 link-local reverse mapping + namespaces, and the answer to any query for a name within those + domains depends on where that query is asked. (This characteristic + is not unique to Multicast DNS. Although the original concept of DNS + was a single global namespace, in recent years, split views, + firewalls, intranets, DNS geolocation, and the like have increasingly + meant that the answer to a given DNS query has become dependent on + the location of the querier.) + + The IPv4 name server address for a Multicast DNS domain is + 224.0.0.251. The IPv6 name server address for a Multicast DNS domain + is FF02::FB. These are multicast addresses; therefore, they identify + not a single host but a collection of hosts, working in cooperation + to maintain some reasonable facsimile of a competently managed DNS + zone. Conceptually, a Multicast DNS domain is a single DNS zone; + however, its server is implemented as a distributed process running + on a cluster of loosely cooperating CPUs rather than as a single + process running on a single CPU. + + Multicast DNS domains are not delegated from their parent domain via + use of NS (Name Server) records, and there is also no concept of + delegation of subdomains within a Multicast DNS domain. Just because + a particular host on the network may answer queries for a particular + record type with the name "example.local." does not imply anything + about whether that host will answer for the name + "child.example.local.", or indeed for other record types with the + name "example.local.". + + + + +Cheshire & Krochmal Standards Track [Page 40] + +RFC 6762 Multicast DNS February 2013 + + + There are no NS records anywhere in Multicast DNS domains. 
Instead, + the Multicast DNS domains are reserved by IANA, and there is + effectively an implicit delegation of all Multicast DNS domains to + the 224.0.0.251:5353 and [FF02::FB]:5353 multicast groups, by virtue + of client software implementing the protocol rules specified in this + document. + + Multicast DNS zones have no SOA (Start of Authority) record. A + conventional DNS zone's SOA record contains information such as the + email address of the zone administrator and the monotonically + increasing serial number of the last zone modification. There is no + single human administrator for any given Multicast DNS zone, so there + is no email address. Because the hosts managing any given Multicast + DNS zone are only loosely coordinated, there is no readily available + monotonically increasing serial number to determine whether or not + the zone contents have changed. A host holding part of the shared + zone could crash or be disconnected from the network at any time + without informing the other hosts. There is no reliable way to + provide a zone serial number that would, whenever such a crash or + disconnection occurred, immediately change to indicate that the + contents of the shared zone had changed. + + Zone transfers are not possible for any Multicast DNS zone. + +13. Enabling and Disabling Multicast DNS + + The option to fail-over to Multicast DNS for names not ending in + ".local." SHOULD be a user-configured option, and SHOULD be disabled + by default because of the possible security issues related to + unintended local resolution of apparently global names. Enabling + Multicast DNS for names not ending in ".local." may be appropriate on + a secure isolated network, or on some future network where machines + exclusively use DNSSEC for all DNS queries, and have Multicast DNS + responders capable of generating the appropriate cryptographic DNSSEC + signatures, thereby guarding against spoofing.
+ + The option to look up unqualified (relative) names by appending + ".local." (or not) is controlled by whether ".local." appears (or + not) in the client's DNS search list. + + No special control is needed for enabling and disabling Multicast DNS + for names explicitly ending with ".local." as entered by the user. + The user doesn't need a way to disable Multicast DNS for names ending + with ".local.", because if the user doesn't want to use Multicast + DNS, they can achieve this by simply not using those names. If a + user *does* enter a name ending in ".local.", then we can safely + assume the user's intention was probably that it should work. Having + user configuration options that can be (intentionally or + + + +Cheshire & Krochmal Standards Track [Page 41] + +RFC 6762 Multicast DNS February 2013 + + + unintentionally) set so that local names don't work is just one more + way of frustrating the user's ability to perform the tasks they want, + perpetuating the view that, "IP networking is too complicated to + configure and too hard to use". + +14. Considerations for Multiple Interfaces + + A host SHOULD defend its dot-local host name on all active interfaces + on which it is answering Multicast DNS queries. + + In the event of a name conflict on *any* interface, a host should + configure a new host name, if it wishes to maintain uniqueness of its + host name. + + A host may choose to use the same name (or set of names) for all of + its address records on all interfaces, or it may choose to manage its + Multicast DNS interfaces independently, potentially answering to a + different name (or set of names) on different interfaces. + + Except in the case of proxying and other similar specialized uses, + addresses in IPv4 or IPv6 address records in Multicast DNS responses + MUST be valid for use on the interface on which the response is being + sent. 
+ + Just as the same link-local IP address may validly be in use + simultaneously on different links by different hosts, the same link- + local host name may validly be in use simultaneously on different + links, and this is not an error. A multihomed host with connections + to two different links may be able to communicate with two different + hosts that are validly using the same name. While this kind of name + duplication should be rare, it means that a host that wants to fully + support this case needs network programming APIs that allow + applications to specify on what interface to perform a link-local + Multicast DNS query, and to discover on what interface a Multicast + DNS response was received. + + There is one other special precaution that multihomed hosts need to + take. It's common with today's laptop computers to have an Ethernet + connection and an 802.11 [IEEE.802.11] wireless connection active at + the same time. What the software on the laptop computer can't easily + tell is whether the wireless connection is in fact bridged onto the + same network segment as its Ethernet connection. If the two networks + are bridged together, then packets the host sends on one interface + will arrive on the other interface a few milliseconds later, and care + must be taken to ensure that this bridging does not cause problems: + + + + + + +Cheshire & Krochmal Standards Track [Page 42] + +RFC 6762 Multicast DNS February 2013 + + + When the host announces its host name (i.e., its address records) on + its wireless interface, those announcement records are sent with the + cache-flush bit set, so when they arrive on the Ethernet segment, + they will cause all the peers on the Ethernet to flush the host's + Ethernet address records from their caches. 
The Multicast DNS + protocol has a safeguard to protect against this situation: when + records are received with the cache-flush bit set, other records are + not deleted from peer caches immediately, but are marked for deletion + in one second. When the host sees its own wireless address records + arrive on its Ethernet interface, with the cache-flush bit set, this + one-second grace period gives the host time to respond and re- + announce its Ethernet address records, to reinstate those records in + peer caches before they are deleted. + + As described, this solves one problem, but creates another, because + when those Ethernet announcement records arrive back on the wireless + interface, the host would again respond defensively to reinstate its + wireless records, and this process would continue forever, + continuously flooding the network with traffic. The Multicast DNS + protocol has a second safeguard, to solve this problem: the cache- + flush bit does not apply to records received very recently, within + the last second. This means that when the host sees its own Ethernet + address records arrive on its wireless interface, with the cache- + flush bit set, it knows there's no need to re-announce its wireless + address records again because it already sent them less than a second + ago, and this makes them immune from deletion from peer caches. (See + Section 10.2.) + +15. Considerations for Multiple Responders on the Same Machine + + It is possible to have more than one Multicast DNS responder and/or + querier implementation coexist on the same machine, but there are + some known issues. + +15.1. Receiving Unicast Responses + + In most operating systems, incoming *multicast* packets can be + delivered to *all* open sockets bound to the right port number, + provided that the clients take the appropriate steps to allow this. 
+ For this reason, all Multicast DNS implementations SHOULD use the + SO_REUSEPORT and/or SO_REUSEADDR options (or equivalent as + appropriate for the operating system in question) so they will all be + able to bind to UDP port 5353 and receive incoming multicast packets + addressed to that port. However, unlike multicast packets, incoming + unicast UDP packets are typically delivered only to the first socket + to bind to that port. This means that "QU" responses and other + packets sent via unicast will be received only by the first Multicast + DNS responder and/or querier on a system. This limitation can be + + + +Cheshire & Krochmal Standards Track [Page 43] + +RFC 6762 Multicast DNS February 2013 + + + partially mitigated if Multicast DNS implementations detect when they + are not the first to bind to port 5353, and in that case they do not + request "QU" responses. One way to detect if there is another + Multicast DNS implementation already running is to attempt binding to + port 5353 without using SO_REUSEPORT and/or SO_REUSEADDR, and if that + fails it indicates that some other socket is already bound to this + port. + +15.2. Multipacket Known-Answer lists + + When a Multicast DNS querier issues a query with too many Known + Answers to fit into a single packet, it divides the Known-Answer list + into two or more packets. Multicast DNS responders associate the + initial truncated query with its continuation packets by examining + the source IP address in each packet. Since two independent + Multicast DNS queriers running on the same machine will be sending + packets with the same source IP address, from an outside perspective + they appear to be a single entity. If both queriers happened to send + the same multipacket query at the same time, with different Known- + Answer lists, then they could each end up suppressing answers that + the other needs. + +15.3. 
Efficiency + + If different clients on a machine were each to have their own + independent Multicast DNS implementation, they would lose certain + efficiency benefits. Apart from the unnecessary code duplication, + memory usage, and CPU load, the clients wouldn't get the benefit of a + shared system-wide cache, and they would not be able to aggregate + separate queries into single packets to reduce network traffic. + +15.4. Recommendation + + Because of these issues, this document encourages implementers to + design systems with a single Multicast DNS implementation that + provides Multicast DNS services shared by all clients on that + machine, much as most operating systems today have a single TCP + implementation, which is shared between all clients on that machine. + Due to engineering constraints, there may be situations where + embedding a "user-level" Multicast DNS implementation in the client + application software is the most expedient solution, and while this + will usually work in practice, implementers should be aware of the + issues outlined in this section. + + + + + + + + +Cheshire & Krochmal Standards Track [Page 44] + +RFC 6762 Multicast DNS February 2013 + + +16. Multicast DNS Character Set + + Historically, Unicast DNS has been used with a very restricted set of + characters. Indeed, conventional DNS is usually limited to just + twenty-six letters, ten digits and the hyphen character, not even + allowing spaces or other punctuation. Attempts to remedy this for + Unicast DNS have been badly constrained by the perceived need to + accommodate old buggy legacy DNS implementations. In reality, the + DNS specification itself actually imposes no limits on what + characters may be used in names, and good DNS implementations handle + any arbitrary eight-bit data without trouble. 
"Clarifications to the + DNS Specification" [RFC2181] directly discusses the subject of + allowable character set in Section 11 ("Name syntax"), and explicitly + states that DNS names may contain arbitrary eight-bit data. However, + the old rules for ARPANET host names back in the 1980s required host + names to be just letters, digits, and hyphens [RFC1034], and since + the predominant use of DNS is to store host address records, many + have assumed that the DNS protocol itself suffers from the same + limitation. It might be accurate to say that there could be + hypothetical bad implementations that do not handle eight-bit data + correctly, but it would not be accurate to say that the protocol + doesn't allow names containing eight-bit data. + + Multicast DNS is a new protocol and doesn't (yet) have old buggy + legacy implementations to constrain the design choices. Accordingly, + it adopts the simple obvious elegant solution: all names in Multicast + DNS MUST be encoded as precomposed UTF-8 [RFC3629] "Net-Unicode" + [RFC5198] text. + + Some users of 16-bit Unicode have taken to stuffing a "zero-width + nonbreaking space" character (U+FEFF) at the start of each UTF-16 + file, as a hint to identify whether the data is big-endian or little- + endian, and calling it a "Byte Order Mark" (BOM). Since there is + only one possible byte order for UTF-8 data, a BOM is neither + necessary nor permitted. Multicast DNS names MUST NOT contain a + "Byte Order Mark". Any occurrence of the Unicode character U+FEFF at + the start or anywhere else in a Multicast DNS name MUST be + interpreted as being an actual intended part of the name, + representing (just as for any other legal unicode value) an actual + literal instance of that character (in this case a zero-width non- + breaking space character). 
+ + For names that are restricted to US-ASCII [RFC0020] letters, digits, + and hyphens, the UTF-8 encoding is identical to the US-ASCII + encoding, so this is entirely compatible with existing host names. + For characters outside the US-ASCII range, UTF-8 encoding is used. + + + + + +Cheshire & Krochmal Standards Track [Page 45] + +RFC 6762 Multicast DNS February 2013 + + + Multicast DNS implementations MUST NOT use any other encodings apart + from precomposed UTF-8 (US-ASCII being considered a compatible subset + of UTF-8). The reasons for selecting UTF-8 instead of Punycode + [RFC3492] are discussed further in Appendix F. + + The simple rules for case-insensitivity in Unicast DNS [RFC1034] + [RFC1035] also apply in Multicast DNS; that is to say, in name + comparisons, the lowercase letters "a" to "z" (0x61 to 0x7A) match + their uppercase equivalents "A" to "Z" (0x41 to 0x5A). Hence, if a + querier issues a query for an address record with the name + "myprinter.local.", then a responder having an address record with + the name "MyPrinter.local." should issue a response. No other + automatic equivalences should be assumed. In particular, all UTF-8 + multibyte characters (codes 0x80 and higher) are compared by simple + binary comparison of the raw byte values. Accented characters are + *not* defined to be automatically equivalent to their unaccented + counterparts. Where automatic equivalences are desired, this may be + achieved through the use of programmatically generated CNAME records. + For example, if a responder has an address record for an accented + name Y, and a querier issues a query for a name X, where X is the + same as Y with all the accents removed, then the responder may issue + a response containing two resource records: a CNAME record "X CNAME + Y", asserting that the requested name X (unaccented) is an alias for + the true (accented) name Y, followed by the address record for Y. + +17. 
Multicast DNS Message Size + + The 1987 DNS specification [RFC1035] restricts DNS messages carried + by UDP to no more than 512 bytes (not counting the IP or UDP + headers). For UDP packets carried over the wide-area Internet in + 1987, this was appropriate. For link-local multicast packets on + today's networks, there is no reason to retain this restriction. + Given that the packets are by definition link-local, there are no + Path MTU issues to consider. + + Multicast DNS messages carried by UDP may be up to the IP MTU of the + physical interface, less the space required for the IP header (20 + bytes for IPv4; 40 bytes for IPv6) and the UDP header (8 bytes). + + In the case of a single Multicast DNS resource record that is too + large to fit in a single MTU-sized multicast response packet, a + Multicast DNS responder SHOULD send the resource record alone, in a + single IP datagram, using multiple IP fragments. Resource records + this large SHOULD be avoided, except in the very rare cases where + they really are the appropriate solution to the problem at hand. + Implementers should be aware that many simple devices do not + reassemble fragmented IP datagrams, so large resource records SHOULD + NOT be used except in specialized cases where the implementer knows + + + +Cheshire & Krochmal Standards Track [Page 46] + +RFC 6762 Multicast DNS February 2013 + + + that all receivers implement reassembly, or where the large resource + record contains optional data which is not essential for correct + operation of the client. + + A Multicast DNS packet larger than the interface MTU, which is sent + using fragments, MUST NOT contain more than one resource record. + + Even when fragmentation is used, a Multicast DNS packet, including IP + and UDP headers, MUST NOT exceed 9000 bytes. + + Note that 9000 bytes is also the maximum payload size of an Ethernet + "Jumbo" packet [Jumbo]. 
However, in practice Ethernet "Jumbo" + packets are not widely used, so it is advantageous to keep packets + under 1500 bytes whenever possible. Even on hosts that normally + handle Ethernet "Jumbo" packets and IP fragment reassembly, it is + becoming more common for these hosts to implement power-saving modes + where the main CPU goes to sleep and hands off packet reception tasks + to a more limited processor in the network interface hardware, which + may not support Ethernet "Jumbo" packets or IP fragment reassembly. + +18. Multicast DNS Message Format + + This section describes specific rules pertaining to the allowable + values for the header fields of a Multicast DNS message, and other + message format considerations. + +18.1. ID (Query Identifier) + + Multicast DNS implementations SHOULD listen for unsolicited responses + issued by hosts booting up (or waking up from sleep or otherwise + joining the network). Since these unsolicited responses may contain + a useful answer to a question for which the querier is currently + awaiting an answer, Multicast DNS implementations SHOULD examine all + received Multicast DNS response messages for useful answers, without + regard to the contents of the ID field or the Question Section. In + Multicast DNS, knowing which particular query message (if any) is + responsible for eliciting a particular response message is less + interesting than knowing whether the response message contains useful + information. + + Multicast DNS implementations MAY cache data from any or all + Multicast DNS response messages they receive, for possible future + use, provided of course that normal TTL aging is performed on these + cached resource records. + + In multicast query messages, the Query Identifier SHOULD be set to + zero on transmission. 
+ + + + +Cheshire & Krochmal Standards Track [Page 47] + +RFC 6762 Multicast DNS February 2013 + + + In multicast responses, including unsolicited multicast responses, + the Query Identifier MUST be set to zero on transmission, and MUST be + ignored on reception. + + In legacy unicast response messages generated specifically in + response to a particular (unicast or multicast) query, the Query + Identifier MUST match the ID from the query message. + +18.2. QR (Query/Response) Bit + + In query messages the QR bit MUST be zero. + In response messages the QR bit MUST be one. + +18.3. OPCODE + + In both multicast query and multicast response messages, the OPCODE + MUST be zero on transmission (only standard queries are currently + supported over multicast). Multicast DNS messages received with an + OPCODE other than zero MUST be silently ignored. + +18.4. AA (Authoritative Answer) Bit + + In query messages, the Authoritative Answer bit MUST be zero on + transmission, and MUST be ignored on reception. + + In response messages for Multicast domains, the Authoritative Answer + bit MUST be set to one (not setting this bit would imply there's some + other place where "better" information may be found) and MUST be + ignored on reception. + +18.5. TC (Truncated) Bit + + In query messages, if the TC bit is set, it means that additional + Known-Answer records may be following shortly. A responder SHOULD + record this fact, and wait for those additional Known-Answer records, + before deciding whether to respond. If the TC bit is clear, it means + that the querying host has no additional Known Answers. + + In multicast response messages, the TC bit MUST be zero on + transmission, and MUST be ignored on reception. + + In legacy unicast response messages, the TC bit has the same meaning + as in conventional Unicast DNS: it means that the response was too + large to fit in a single packet, so the querier SHOULD reissue its + query using TCP in order to receive the larger response. 
+ + + + + + +Cheshire & Krochmal Standards Track [Page 48] + +RFC 6762 Multicast DNS February 2013 + + +18.6. RD (Recursion Desired) Bit + + In both multicast query and multicast response messages, the + Recursion Desired bit SHOULD be zero on transmission, and MUST be + ignored on reception. + +18.7. RA (Recursion Available) Bit + + In both multicast query and multicast response messages, the + Recursion Available bit MUST be zero on transmission, and MUST be + ignored on reception. + +18.8. Z (Zero) Bit + + In both query and response messages, the Zero bit MUST be zero on + transmission, and MUST be ignored on reception. + +18.9. AD (Authentic Data) Bit + + In both multicast query and multicast response messages, the + Authentic Data bit [RFC2535] MUST be zero on transmission, and MUST + be ignored on reception. + +18.10. CD (Checking Disabled) Bit + + In both multicast query and multicast response messages, the Checking + Disabled bit [RFC2535] MUST be zero on transmission, and MUST be + ignored on reception. + +18.11. RCODE (Response Code) + + In both multicast query and multicast response messages, the Response + Code MUST be zero on transmission. Multicast DNS messages received + with non-zero Response Codes MUST be silently ignored. + +18.12. Repurposing of Top Bit of qclass in Question Section + + In the Question Section of a Multicast DNS query, the top bit of the + qclass field is used to indicate that unicast responses are preferred + for this particular question. (See Section 5.4.) + +18.13. Repurposing of Top Bit of rrclass in Resource Record Sections + + In the Resource Record Sections of a Multicast DNS response, the top + bit of the rrclass field is used to indicate that the record is a + member of a unique RRSet, and the entire RRSet has been sent together + (in the same packet, or in consecutive packets if there are too many + records to fit in a single packet). (See Section 10.2.) 
+ + + +Cheshire & Krochmal Standards Track [Page 49] + +RFC 6762 Multicast DNS February 2013 + + +18.14. Name Compression + + When generating Multicast DNS messages, implementations SHOULD use + name compression wherever possible to compress the names of resource + records, by replacing some or all of the resource record name with a + compact two-byte reference to an appearance of that data somewhere + earlier in the message [RFC1035]. + + This applies not only to Multicast DNS responses, but also to + queries. When a query contains more than one question, successive + questions in the same message often contain similar names, and + consequently name compression SHOULD be used, to save bytes. In + addition, queries may also contain Known Answers in the Answer + Section, or probe tiebreaking data in the Authority Section, and + these names SHOULD similarly be compressed for network efficiency. + + In addition to compressing the *names* of resource records, names + that appear within the *rdata* of the following rrtypes SHOULD also + be compressed in all Multicast DNS messages: + + NS, CNAME, PTR, DNAME, SOA, MX, AFSDB, RT, KX, RP, PX, SRV, NSEC + + Until future IETF Standards Action [RFC5226] specifying that names in + the rdata of other types should be compressed, names that appear + within the rdata of any type not listed above MUST NOT be compressed. + + Implementations receiving Multicast DNS messages MUST correctly + decode compressed names appearing in the Question Section, and + compressed names of resource records appearing in other sections. + + In addition, implementations MUST correctly decode compressed names + appearing within the *rdata* of the rrtypes listed above. 
Where + possible, implementations SHOULD also correctly decode compressed + names appearing within the *rdata* of other rrtypes known to the + implementers at the time of implementation, because such forward- + thinking planning helps facilitate the deployment of future + implementations that may have reason to compress those rrtypes. It + is possible that no future IETF Standards Action [RFC5226] will be + created that mandates or permits the compression of rdata in new + types, but having implementations designed such that they are capable + of decompressing all known types helps keep future options open. + + One specific difference between Unicast DNS and Multicast DNS is that + Unicast DNS does not allow name compression for the target host in an + SRV record, because Unicast DNS implementations before the first SRV + specification in 1996 [RFC2052] may not decode these compressed + + + + + +Cheshire & Krochmal Standards Track [Page 50] + +RFC 6762 Multicast DNS February 2013 + + + records properly. Since all Multicast DNS implementations were + created after 1996, all Multicast DNS implementations are REQUIRED to + decode compressed SRV records correctly. + + In legacy unicast responses generated to answer legacy queries, name + compression MUST NOT be performed on SRV records. + +19. Summary of Differences between Multicast DNS and Unicast DNS + + Multicast DNS shares, as much as possible, the familiar APIs, naming + syntax, resource record types, etc., of Unicast DNS. There are, of + course, necessary differences by virtue of it using multicast, and by + virtue of it operating in a community of cooperating peers, rather + than a precisely defined hierarchy controlled by a strict chain of + formal delegations from the root. These differences are summarized + below: + + Multicast DNS... 
+ * uses multicast + * uses UDP port 5353 instead of port 53 + * operates in well-defined parts of the DNS namespace + * has no SOA (Start of Authority) records + * uses UTF-8, and only UTF-8, to encode resource record names + * allows names up to 255 bytes plus a terminating zero byte + * allows name compression in rdata for SRV and other record types + * allows larger UDP packets + * allows more than one question in a query message + * defines consistent results for qtype "ANY" and qclass "ANY" queries + * uses the Answer Section of a query to list Known Answers + * uses the TC bit in a query to indicate additional Known Answers + * uses the Authority Section of a query for probe tiebreaking + * ignores the Query ID field (except for generating legacy responses) + * doesn't require the question to be repeated in the response message + * uses unsolicited responses to announce new records + * uses NSEC records to signal nonexistence of records + * defines a unicast-response bit in the rrclass of query questions + * defines a cache-flush bit in the rrclass of response records + * uses DNS RR TTL 0 to indicate that a record has been deleted + * recommends AAAA records in the additional section when responding + to rrtype "A" queries, and vice versa + * monitors queries to perform Duplicate Question Suppression + * monitors responses to perform Duplicate Answer Suppression... + * ... and Ongoing Conflict Detection + * ... and Opportunistic Caching + + + + + + + +Cheshire & Krochmal Standards Track [Page 51] + +RFC 6762 Multicast DNS February 2013 + + +20. IPv6 Considerations + + An IPv4-only host and an IPv6-only host behave as "ships that pass in + the night". Even if they are on the same Ethernet, neither is aware + of the other's traffic. For this reason, each physical link may have + *two* unrelated ".local." zones, one for IPv4 and one for IPv6. 
+ Since for practical purposes, a group of IPv4-only hosts and a group + of IPv6-only hosts on the same Ethernet act as if they were on two + entirely separate Ethernet segments, it is unsurprising that their + use of the ".local." zone should occur exactly as it would if they + really were on two entirely separate Ethernet segments. + + A dual-stack (v4/v6) host can participate in both ".local." zones, + and should register its name(s) and perform its lookups both using + IPv4 and IPv6. This enables it to reach, and be reached by, both + IPv4-only and IPv6-only hosts. In effect, this acts like a + multihomed host, with one connection to the logical "IPv4 Ethernet + segment", and a connection to the logical "IPv6 Ethernet segment". + When such a host generates NSEC records, if it is using the same host + name for its IPv4 addresses and its IPv6 addresses on that network + interface, its NSEC records should indicate that the host name has + both A and AAAA records. + +21. Security Considerations + + The algorithm for detecting and resolving name conflicts is, by its + very nature, an algorithm that assumes cooperating participants. Its + purpose is to allow a group of hosts to arrive at a mutually disjoint + set of host names and other DNS resource record names, in the absence + of any central authority to coordinate this or mediate disputes. In + the absence of any higher authority to resolve disputes, the only + alternative is that the participants must work together cooperatively + to arrive at a resolution. + + In an environment where the participants are mutually antagonistic + and unwilling to cooperate, other mechanisms are appropriate, like + manually configured DNS. 
+ + In an environment where there is a group of cooperating participants, + but clients cannot be sure that there are no antagonistic hosts on + the same physical link, the cooperating participants need to use + IPsec signatures and/or DNSSEC [RFC4033] signatures so that they can + distinguish Multicast DNS messages from trusted participants (which + they process as usual) from Multicast DNS messages from untrusted + participants (which they silently discard). + + + + + + +Cheshire & Krochmal Standards Track [Page 52] + +RFC 6762 Multicast DNS February 2013 + + + If DNS queries for *global* DNS names are sent to the mDNS multicast + address (during network outages which disrupt communication with the + greater Internet) it is *especially* important to use DNSSEC, because + the user may have the impression that he or she is communicating with + some authentic host, when in fact he or she is really communicating + with some local host that is merely masquerading as that name. This + is less critical for names ending with ".local.", because the user + should be aware that those names have only local significance and no + global authority is implied. + + Most computer users neglect to type the trailing dot at the end of a + fully qualified domain name, making it a relative domain name (e.g., + "www.example.com"). In the event of network outage, attempts to + positively resolve the name as entered will fail, resulting in + application of the search list, including ".local.", if present. A + malicious host could masquerade as "www.example.com." by answering + the resulting Multicast DNS query for "www.example.com.local.". To + avoid this, a host MUST NOT append the search suffix ".local.", if + present, to any relative (partially qualified) host name containing + two or more labels. Appending ".local." to single-label relative + host names is acceptable, since the user should have no expectation + that a single-label host name will resolve as is. 
However, users who + have both "example.com" and "local" in their search lists should be + aware that if they type "www" into their web browser, it may not be + immediately clear to them whether the page that appears is + "www.example.com" or "www.local". + + Multicast DNS uses UDP port 5353. On operating systems where only + privileged processes are allowed to use ports below 1024, no such + privilege is required to use port 5353. + +22. IANA Considerations + + IANA has allocated the UDP port 5353 for the Multicast DNS protocol + described in this document [SN]. + + IANA has allocated the IPv4 link-local multicast address 224.0.0.251 + for the use described in this document [MC4]. + + IANA has allocated the IPv6 multicast address set FF0X::FB (where "X" + indicates any hexadecimal digit from '1' to 'F') for the use + described in this document [MC6]. Only address FF02::FB (link-local + scope) is currently in use by deployed software, but it is possible + that in the future implementers may experiment with Multicast DNS + using larger-scoped addresses, such as FF05::FB (site-local scope) + [RFC4291]. + + + + + +Cheshire & Krochmal Standards Track [Page 53] + +RFC 6762 Multicast DNS February 2013 + + + IANA has implemented the following DNS records: + + MDNS.MCAST.NET. IN A 224.0.0.251 + 251.0.0.224.IN-ADDR.ARPA. IN PTR MDNS.MCAST.NET. + + Entries for the AAAA and corresponding PTR records have not been made + as there is not yet an RFC providing direction for the management of + the IP6.ARPA domain relating to the IPv6 multicast address space. + + The reuse of the top bit of the rrclass field in the Question and + Resource Record Sections means that Multicast DNS can only carry DNS + records with classes in the range 0-32767. Classes in the range + 32768 to 65535 are incompatible with Multicast DNS. 
IANA has noted + this fact, and if IANA receives a request to allocate a DNS class + value above 32767, IANA will make sure the requester is aware of this + implication before proceeding. This does not mean that allocations + of DNS class values above 32767 should be denied, only that they + should not be allowed until the requester has indicated that they are + aware of how this allocation will interact with Multicast DNS. + However, to date, only three DNS classes have been assigned by IANA + (1, 3, and 4), and only one (1, "Internet") is actually in widespread + use, so this issue is likely to remain a purely theoretical one. + + IANA has recorded the list of domains below as being Special-Use + Domain Names [RFC6761]: + + .local. + .254.169.in-addr.arpa. + .8.e.f.ip6.arpa. + .9.e.f.ip6.arpa. + .a.e.f.ip6.arpa. + .b.e.f.ip6.arpa. + +22.1. Domain Name Reservation Considerations + + The six domains listed above, and any names falling within those + domains (e.g., "MyPrinter.local.", "34.12.254.169.in-addr.arpa.", + "Ink-Jet._pdl-datastream._tcp.local.") are special [RFC6761] in the + following ways: + + 1. Users may use these names as they would other DNS names, + entering them anywhere that they would otherwise enter a + conventional DNS name, or a dotted decimal IPv4 address, or a + literal IPv6 address. + + Since there is no central authority responsible for assigning + dot-local names, and all devices on the local network are + equally entitled to claim any dot-local name, users SHOULD be + + + +Cheshire & Krochmal Standards Track [Page 54] + +RFC 6762 Multicast DNS February 2013 + + + aware of this and SHOULD exercise appropriate caution. 
In an + untrusted or unfamiliar network environment, users SHOULD be + aware that using a name like "www.local" may not actually + connect them to the web site they expected, and could easily + connect them to a different web page, or even a fake or spoof + of their intended web site, designed to trick them into + revealing confidential information. As always with networking, + end-to-end cryptographic security can be a useful tool. For + example, when connecting with ssh, the ssh host key + verification process will inform the user if it detects that + the identity of the entity they are communicating with has + changed since the last time they connected to that name. + + 2. Application software may use these names as they would other + similar DNS names, and is not required to recognize the names + and treat them specially. Due to the relative ease of spoofing + dot-local names, end-to-end cryptographic security remains + important when communicating across a local network, just as it + is when communicating across the global Internet. + + 3. Name resolution APIs and libraries SHOULD recognize these names + as special and SHOULD NOT send queries for these names to their + configured (unicast) caching DNS server(s). This is to avoid + unnecessary load on the root name servers and other name + servers, caused by queries for which those name servers do not + have useful non-negative answers to give, and will not ever + have useful non-negative answers to give. + + 4. Caching DNS servers SHOULD recognize these names as special and + SHOULD NOT attempt to look up NS records for them, or otherwise + query authoritative DNS servers in an attempt to resolve these + names. Instead, caching DNS servers SHOULD generate immediate + NXDOMAIN responses for all such queries they may receive (from + misbehaving name resolver libraries). This is to avoid + unnecessary load on the root name servers and other name + servers. + + 5. 
Authoritative DNS servers SHOULD NOT by default be configurable + to answer queries for these names, and, like caching DNS + servers, SHOULD generate immediate NXDOMAIN responses for all + such queries they may receive. DNS server software MAY provide + a configuration option to override this default, for testing + purposes or other specialized uses. + + 6. DNS server operators SHOULD NOT attempt to configure + authoritative DNS servers to act as authoritative for any of + these names. Configuring an authoritative DNS server to act as + authoritative for any of these names may not, in many cases, + + + +Cheshire & Krochmal Standards Track [Page 55] + +RFC 6762 Multicast DNS February 2013 + + + yield the expected result. Since name resolver libraries and + caching DNS servers SHOULD NOT send queries for those names + (see 3 and 4 above), such queries SHOULD be suppressed before + they even reach the authoritative DNS server in question, and + consequently it will not even get an opportunity to answer + them. + + 7. DNS Registrars MUST NOT allow any of these names to be + registered in the normal way to any person or entity. These + names are reserved protocol identifiers with special meaning + and fall outside the set of names available for allocation by + registrars. Attempting to allocate one of these names as if it + were a normal domain name will probably not work as desired, + for reasons 3, 4, and 6 above. + +23. Acknowledgments + + The concepts described in this document have been explored, + developed, and implemented with help from Ran Atkinson, Richard + Brown, Freek Dijkstra, Erik Guttman, Kyle McKay, Pasi Sarolahti, + Pekka Savola, Robby Simpson, Mark Townsley, Paul Vixie, Bill + Woodcock, and others. Special thanks go to Bob Bradley, Josh + Graessley, Scott Herscher, Rory McGuire, Roger Pantos, and Kiren + Sekar for their significant contributions. 
Special thanks also to + Kerry Lynn for converting the document to xml2rfc form in May 2010, + and to Area Director Ralph Droms for shepherding the document through + its final steps. + +24. References + +24.1. Normative References + + [MC4] IANA, "IPv4 Multicast Address Space Registry", + <http://www.iana.org/assignments/multicast-addresses/>. + + [MC6] IANA, "IPv6 Multicast Address Space Registry", + <http://www.iana.org/assignments/ipv6-multicast-addresses/>. + + [RFC0020] Cerf, V., "ASCII format for network interchange", RFC 20, + October 1969. + + [RFC1034] Mockapetris, P., "Domain names - concepts and facilities", + STD 13, RFC 1034, November 1987. + + [RFC1035] Mockapetris, P., "Domain names - implementation and + specification", STD 13, RFC 1035, November 1987. + + + + +Cheshire & Krochmal Standards Track [Page 56] + +RFC 6762 Multicast DNS February 2013 + + + [RFC2119] Bradner, S., "Key words for use in RFCs to Indicate + Requirement Levels", BCP 14, RFC 2119, March 1997. + + [RFC3629] Yergeau, F., "UTF-8, a transformation format of ISO + 10646", STD 63, RFC 3629, November 2003. + + [RFC4034] Arends, R., Austein, R., Larson, M., Massey, D., and S. + Rose, "Resource Records for the DNS Security Extensions", + RFC 4034, March 2005. + + [RFC5198] Klensin, J. and M. Padlipsky, "Unicode Format for Network + Interchange", RFC 5198, March 2008. + + [RFC6195] Eastlake 3rd, D., "Domain Name System (DNS) IANA + Considerations", BCP 42, RFC 6195, March 2011. + + [RFC6761] Cheshire, S. and M. Krochmal, "Special-Use Domain Names", + RFC 6761, February 2013. + + [SN] IANA, "Service Name and Transport Protocol Port Number + Registry", <http://www.iana.org/assignments/service-names-port-numbers/>. + +24.2. Informative References + + [B4W] "Bonjour for Windows", + <http://en.wikipedia.org/wiki/Bonjour_(software)>. + + [BJ] Apple Bonjour Open Source Software, + <http://developer.apple.com/bonjour/>.
+ + [IEEE.802.3] + "Information technology - Telecommunications and + information exchange between systems - Local and + metropolitan area networks - Specific requirements - Part + 3: Carrier Sense Multiple Access with Collision Detection + (CSMA/CD) Access Method and Physical Layer + Specifications", IEEE Std 802.3-2008, December 2008, + <http://standards.ieee.org/getieee802/802.3.html>. + + [IEEE.802.11] + "Information technology - Telecommunications and + information exchange between systems - Local and + metropolitan area networks - Specific requirements - Part + 11: Wireless LAN Medium Access Control (MAC) and Physical + Layer (PHY) Specifications", IEEE Std 802.11-2007, June + 2007, <http://standards.ieee.org/getieee802/802.11.html>. + + + + +Cheshire & Krochmal Standards Track [Page 57] + +RFC 6762 Multicast DNS February 2013 + + + [Jumbo] "Ethernet Jumbo Frames", November 2009, + <http://www.ethernetalliance.org/wp-content/uploads/2011/10/EA-Ethernet-Jumbo-Frames-v0-1.pdf>. + + [NSD] "NsdManager | Android Developer", June 2012, + <http://developer.android.com/reference/android/net/nsd/NsdManager.html>. + + [RFC2052] Gulbrandsen, A. and P. Vixie, "A DNS RR for specifying the + location of services (DNS SRV)", RFC 2052, October 1996. + + [RFC2132] Alexander, S. and R. Droms, "DHCP Options and BOOTP Vendor + Extensions", RFC 2132, March 1997. + + [RFC2136] Vixie, P., Ed., Thomson, S., Rekhter, Y., and J. Bound, + "Dynamic Updates in the Domain Name System (DNS UPDATE)", + RFC 2136, April 1997. + + [RFC2181] Elz, R. and R. Bush, "Clarifications to the DNS + Specification", RFC 2181, July 1997. + + [RFC2535] Eastlake 3rd, D., "Domain Name System Security + Extensions", RFC 2535, March 1999. + + [RFC2671] Vixie, P., "Extension Mechanisms for DNS (EDNS0)", RFC + 2671, August 1999. + + [RFC2845] Vixie, P., Gudmundsson, O., Eastlake 3rd, D., and B. + Wellington, "Secret Key Transaction Authentication for DNS + (TSIG)", RFC 2845, May 2000. + + [RFC2930] Eastlake 3rd, D., "Secret Key Establishment for DNS (TKEY + RR)", RFC 2930, September 2000.
+ + [RFC2931] Eastlake 3rd, D., "DNS Request and Transaction Signatures + ( SIG(0)s )", RFC 2931, September 2000. + + [RFC3007] Wellington, B., "Secure Domain Name System (DNS) Dynamic + Update", RFC 3007, November 2000. + + [RFC3492] Costello, A., "Punycode: A Bootstring encoding of Unicode + for Internationalized Domain Names in Applications + (IDNA)", RFC 3492, March 2003. + + + + + +Cheshire & Krochmal Standards Track [Page 58] + +RFC 6762 Multicast DNS February 2013 + + + [RFC3927] Cheshire, S., Aboba, B., and E. Guttman, "Dynamic + Configuration of IPv4 Link-Local Addresses", RFC 3927, May + 2005. + + [RFC4033] Arends, R., Austein, R., Larson, M., Massey, D., and S. + Rose, "DNS Security Introduction and Requirements", RFC + 4033, March 2005. + + [RFC4291] Hinden, R. and S. Deering, "IP Version 6 Addressing + Architecture", RFC 4291, February 2006. + + [RFC4795] Aboba, B., Thaler, D., and L. Esibov, "Link-local + Multicast Name Resolution (LLMNR)", RFC 4795, January + 2007. + + [RFC4861] Narten, T., Nordmark, E., Simpson, W., and H. Soliman, + "Neighbor Discovery for IP version 6 (IPv6)", RFC 4861, + September 2007. + + [RFC4862] Thomson, S., Narten, T., and T. Jinmei, "IPv6 Stateless + Address Autoconfiguration", RFC 4862, September 2007. + + [RFC5226] Narten, T. and H. Alvestrand, "Guidelines for Writing an + IANA Considerations Section in RFCs", BCP 26, RFC 5226, + May 2008. + + [RFC5890] Klensin, J., "Internationalized Domain Names for + Applications (IDNA): Definitions and Document Framework", + RFC 5890, August 2010. + + [RFC6281] Cheshire, S., Zhu, Z., Wakikawa, R., and L. Zhang, + "Understanding Apple's Back to My Mac (BTMM) Service", RFC + 6281, June 2011. + + [RFC6760] Cheshire, S. and M. Krochmal, "Requirements for a Protocol + to Replace the AppleTalk Name Binding Protocol (NBP)", RFC + 6760, February 2013. + + [RFC6763] Cheshire, S. and M. Krochmal, "DNS-Based Service + Discovery", RFC 6763, February 2013. + + [Zeroconf] Cheshire, S. and D. 
Steinberg, "Zero Configuration + Networking: The Definitive Guide", O'Reilly Media, Inc., + ISBN 0-596-10100-7, December 2005. + + + + + + + +Cheshire & Krochmal Standards Track [Page 59] + +RFC 6762 Multicast DNS February 2013 + + +Appendix A. Design Rationale for Choice of UDP Port Number + + Arguments were made for and against using UDP port 53, the standard + Unicast DNS port. Some of the arguments are given below. The + arguments for using a different port were greater in number and more + compelling, so that option was ultimately selected. The UDP port + "5353" was selected for its mnemonic similarity to "53". + + Arguments for using UDP port 53: + + * This is "just DNS", so it should be the same port. + + * There is less work to be done updating old resolver libraries to do + simple Multicast DNS queries. Only the destination address need be + changed. In some cases, this can be achieved without any code + changes, just by adding the address 224.0.0.251 to a configuration + file. + + Arguments for using a different port (UDP port 5353): + + * This is not "just DNS". This is a DNS-like protocol, but + different. + + * Changing resolver library code to use a different port number is + not hard. In some cases, this can be achieved without any code + changes, just by adding the address 224.0.0.251:5353 to a + configuration file. + + * Using the same port number makes it hard to run a Multicast DNS + responder and a conventional Unicast DNS server on the same + machine. If a conventional Unicast DNS server wishes to implement + Multicast DNS as well, it can still do that, by opening two + sockets. Having two different port numbers allows this + flexibility. + + * Some VPN software hijacks all outgoing traffic to port 53 and + redirects it to a special DNS server set up to serve those VPN + clients while they are connected to the corporate network. 
It is + questionable whether this is the right thing to do, but it is + common, and redirecting link-local multicast DNS packets to a + remote server rarely produces any useful results. It does mean, + for example, that a user of such VPN software becomes unable to + access their local network printer sitting on their desk right next + to their computer. Using a different UDP port helps avoid this + particular problem. + + + + + + +Cheshire & Krochmal Standards Track [Page 60] + +RFC 6762 Multicast DNS February 2013 + + + * On many operating systems, unprivileged software may not send or + receive packets on low-numbered ports. This means that any + software sending or receiving Multicast DNS packets on port 53 + would have to run as "root", which is an undesirable security risk. + Using a higher-numbered UDP port avoids this restriction. + +Appendix B. Design Rationale for Not Using Hashed Multicast Addresses + + Some discovery protocols use a range of multicast addresses, and + determine the address to be used by a hash function of the name being + sought. Queries are sent via multicast to the address as indicated + by the hash function, and responses are returned to the querier via + unicast. Particularly in IPv6, where multicast addresses are + extremely plentiful, this approach is frequently advocated. For + example, IPv6 Neighbor Discovery [RFC4861] sends Neighbor + Solicitation messages to the "solicited-node multicast address", + which is computed as a function of the solicited IPv6 address. + + There are some disadvantages to using hashed multicast addresses like + this in a service discovery protocol: + + * When a host has a large number of records with different names, the + host may have to join a large number of multicast groups. Each + time a host joins or leaves a multicast group, this results in + Internet Group Management Protocol (IGMP) or Multicast Listener + Discovery (MLD) traffic on the network announcing this fact. 
+ Joining a large number of multicast groups can place undue burden + on the Ethernet hardware, which typically supports a limited number + of multicast addresses efficiently. When this number is exceeded, + the Ethernet hardware may have to resort to receiving all + multicasts and passing them up to the host networking code for + filtering in software, thereby defeating much of the point of using + a multicast address range in the first place. Finally, many IPv6 + stacks have a fixed limit IPV6_MAX_MEMBERSHIPS, and the code simply + fails with an error if a client attempts to exceed this limit. + Common values for IPV6_MAX_MEMBERSHIPS are 20 or 31. + + * Multiple questions cannot be placed in one packet if they don't all + hash to the same multicast address. + + * Duplicate Question Suppression doesn't work if queriers are not + seeing each other's queries. + + * Duplicate Answer Suppression doesn't work if responders are not + seeing each other's responses. + + * Opportunistic Caching doesn't work. + + + + +Cheshire & Krochmal Standards Track [Page 61] + +RFC 6762 Multicast DNS February 2013 + + + * Ongoing Conflict Detection doesn't work. + +Appendix C. Design Rationale for Maximum Multicast DNS Name Length + + Multicast DNS names may be up to 255 bytes long (in the on-the-wire + message format), not counting the terminating zero byte at the end. + + "Domain Names - Implementation and Specification" [RFC1035] says: + + Various objects and parameters in the DNS have size limits. They + are listed below. Some could be easily changed, others are more + fundamental. + + labels 63 octets or less + + names 255 octets or less + + ... + + the total length of a domain name (i.e., label octets and label + length octets) is restricted to 255 octets or less. + + This text does not state whether this 255-byte limit includes the + terminating zero at the end of every name. 
+ + Several factors lead us to conclude that the 255-byte limit does + *not* include the terminating zero: + + o It is common in software engineering to have size limits that are a + power of two, or a multiple of a power of two, for efficiency. For + example, an integer on a modern processor is typically 2, 4, or 8 + bytes, not 3 or 5 bytes. The number 255 is not a power of two, nor + is it to most people a particularly noteworthy number. It is + noteworthy to computer scientists for only one reason -- because it + is exactly one *less* than a power of two. When a size limit is + exactly one less than a power of two, that suggests strongly that + the one extra byte is being reserved for some specific reason -- in + this case reserved, perhaps, to leave room for a terminating zero + at the end. + + o In the case of DNS label lengths, the stated limit is 63 bytes. As + with the total name length, this limit is exactly one less than a + power of two. This label length limit also excludes the label + length byte at the start of every label. Including that extra + byte, a 63-byte label takes 64 bytes of space in memory or in a DNS + message. + + + + + +Cheshire & Krochmal Standards Track [Page 62] + +RFC 6762 Multicast DNS February 2013 + + + o It is common in software engineering for the semantic "length" of + an object to be one less than the number of bytes it takes to store + that object. For example, in C, strlen("foo") is 3, but + sizeof("foo") (which includes the terminating zero byte at the end) + is 4. + + o The text describing the total length of a domain name mentions + explicitly that label length and data octets are included, but does + not mention the terminating zero at the end. The zero byte at the + end of a domain name is not a label length. Indeed, the value zero + is chosen as the terminating marker precisely because it is not a + legal length byte value -- DNS prohibits empty labels. For + example, a name like "bad..name." 
is not a valid domain name + because it contains a zero-length label in the middle, which cannot + be expressed in a DNS message, because software parsing the message + would misinterpret a zero label-length byte as being a zero "end of + name" marker instead. + + Finally, "Clarifications to the DNS Specification" [RFC2181] offers + additional confirmation that, in the context of DNS specifications, + the stated "length" of a domain name does not include the terminating + zero byte at the end. That document refers to the root name, which + is typically written as "." and is represented in a DNS message by a + single lone zero byte (i.e., zero bytes of data plus a terminating + zero), as the "zero length full name": + + The zero length full name is defined as representing the root of + the DNS tree, and is typically written and displayed as ".". + + This wording supports the interpretation that, in a DNS context, when + talking about lengths of names, the terminating zero byte at the end + is not counted. If the root name (".") is considered to be zero + length, then to be consistent, the length (for example) of "org" has + to be 4 and the length of "ietf.org" has to be 9, as shown below: + + ------ + | 0x00 | length = 0 + ------ + + ------------------ ------ + | 0x03 | o | r | g | | 0x00 | length = 4 + ------------------ ------ + + ----------------------------------------- ------ + | 0x04 | i | e | t | f | 0x03 | o | r | g | | 0x00 | length = 9 + ----------------------------------------- ------ + + + + + +Cheshire & Krochmal Standards Track [Page 63] + +RFC 6762 Multicast DNS February 2013 + + + This means that the maximum length of a domain name, as represented + in a Multicast DNS message, up to but not including the final + terminating zero, must not exceed 255 bytes. 
+ + However, many Unicast DNS implementers have read these RFCs + differently, and argue that the 255-byte limit does include the + terminating zero, and that the "Clarifications to the DNS + Specification" [RFC2181] statement that "." is the "zero length full + name" was simply a mistake. + + Hence, implementers should be aware that other Unicast DNS + implementations may limit the maximum domain name to 254 bytes plus a + terminating zero, depending on how that implementer interpreted the + DNS specifications. + + Compliant Multicast DNS implementations MUST support names up to 255 + bytes plus a terminating zero, i.e., 256 bytes total. + +Appendix D. Benefits of Multicast Responses + + Some people have argued that sending responses via multicast is + inefficient on the network. In fact, using multicast responses can + result in a net lowering of overall multicast traffic for a variety + of reasons, and provides other benefits too: + + * Opportunistic Caching. One multicast response can update the + caches on all machines on the network. If another machine later + wants to issue the same query, and it already has the answer in its + cache, it may not need to even transmit that multicast query on the + network at all. + + * Duplicate Query Suppression. When more than one machine has the + same ongoing long-lived query running, every machine does not have + to transmit its own independent query. When one machine transmits + a query, all the other hosts see the answers, so they can suppress + their own queries. + + * Passive Observation Of Failures (POOF). When a host sees a + multicast query, but does not see the corresponding multicast + response, it can use this information to promptly delete stale data + from its cache. To achieve the same level of user-interface + quality and responsiveness without multicast responses would + require lower cache lifetimes and more frequent network polling, + resulting in a higher packet rate. + + * Passive Conflict Detection. 
Just because a name has been + previously verified to be unique does not guarantee it will + continue to be so indefinitely. By allowing all Multicast DNS + + + +Cheshire & Krochmal Standards Track [Page 64] + +RFC 6762 Multicast DNS February 2013 + + + responders to constantly monitor their peers' responses, conflicts + arising out of network topology changes can be promptly detected + and resolved. If responses were not sent via multicast, some other + conflict detection mechanism would be needed, imposing its own + additional burden on the network. + + * Use on devices with constrained memory resources: When using + delayed responses to reduce network collisions, responders need to + maintain a list recording to whom each answer should be sent. The + option of multicast responses allows responders with limited + storage, which cannot store an arbitrarily long list of response + addresses, to choose to fail-over to a single multicast response in + place of multiple unicast responses, when appropriate. + + * Overlayed Subnets. In the case of overlayed subnets, multicast + responses allow a receiver to know with certainty that a response + originated on the local link, even when its source address may + apparently suggest otherwise. + + * Robustness in the face of misconfiguration: Link-local multicast + transcends virtually every conceivable network misconfiguration. + Even if you have a collection of devices where every device's IP + address, subnet mask, default gateway, and DNS server address are + all wrong, packets sent by any of those devices addressed to a + link-local multicast destination address will still be delivered to + all peers on the local link. This can be extremely helpful when + diagnosing and rectifying network problems, since it facilitates a + direct communication channel between client and server that works + without reliance on ARP, IP routing tables, etc. 
Being able to + discover what IP address a device has (or thinks it has) is + frequently a very valuable first step in diagnosing why it is + unable to communicate on the local network. + +Appendix E. Design Rationale for Encoding Negative Responses + + Alternative methods of asserting nonexistence were considered, such + as using an NXDOMAIN response, or emitting a resource record with + zero-length rdata. + + Using an NXDOMAIN response does not work well with Multicast DNS. A + Unicast DNS NXDOMAIN response applies to the entire message, but for + efficiency Multicast DNS allows (and encourages) multiple responses + in a single message. If the error code in the header were NXDOMAIN, + it would not be clear to which name(s) that error code applied. + + Asserting nonexistence by emitting a resource record with zero-length + rdata would mean that there would be no way to differentiate between + a record that doesn't exist, and a record that does exist, with zero- + + + +Cheshire & Krochmal Standards Track [Page 65] + +RFC 6762 Multicast DNS February 2013 + + + length rdata. By analogy, most file systems today allow empty files, + so a file that exists with zero bytes of data is not considered + equivalent to a filename that does not exist. + + A benefit of asserting nonexistence through NSEC records instead of + through NXDOMAIN responses is that NSEC records can be added to the + Additional Section of a DNS response to offer additional information + beyond what the querier explicitly requested. For example, in + response to an SRV query, a responder should include A record(s) + giving its IPv4 addresses in the Additional Section, and an NSEC + record indicating which other types it does or does not have for this + name. If the responder is running on a host that does not support + IPv6 (or does support IPv6 but currently has no IPv6 address on that + interface) then this NSEC record in the Additional Section will + indicate this absence of AAAA records. 
In effect, the responder is + saying, "Here's my SRV record, and here are my IPv4 addresses, and + no, I don't have any IPv6 addresses, so don't waste your time + asking". Without this information in the Additional Section, it + would take the querier an additional round-trip to perform an + additional query to ascertain that the target host has no AAAA + records. (Arguably Unicast DNS could also benefit from this ability + to express nonexistence in the Additional Section, but that is + outside the scope of this document.) + +Appendix F. Use of UTF-8 + + After many years of debate, as a result of the perceived need to + accommodate certain DNS implementations that apparently couldn't + handle any character that's not a letter, digit, or hyphen (and + apparently never would be updated to remedy this limitation), the + Unicast DNS community settled on an extremely baroque encoding called + "Punycode" [RFC3492]. Punycode is a remarkably ingenious encoding + solution, but it is complicated, hard to understand, and hard to + implement, using sophisticated techniques including insertion unsort + coding, generalized variable-length integers, and bias adaptation. + The resulting encoding is remarkably compact given the constraints, + but it's still not as good as simple straightforward UTF-8, and it's + hard even to predict whether a given input string will encode to a + Punycode string that fits within DNS's 63-byte limit, except by + simply trying the encoding and seeing whether it fits. Indeed, the + encoded size depends not only on the input characters, but on the + order they appear, so the same set of characters may or may not + encode to a legal Punycode string that fits within DNS's 63-byte + limit, depending on the order the characters appear. This is + extremely hard to present in a user interface that explains to users + why one name is allowed, but another name containing the exact same + characters is not. 
Neither Punycode nor any other of the "ASCII- + Compatible Encodings" [RFC5890] proposed for Unicast DNS may be used + + + +Cheshire & Krochmal Standards Track [Page 66] + +RFC 6762 Multicast DNS February 2013 + + + in Multicast DNS messages. Any text being represented internally in + some other representation must be converted to canonical precomposed + UTF-8 before being placed in any Multicast DNS message. + +Appendix G. Private DNS Namespaces + + The special treatment of names ending in ".local." has been + implemented in Macintosh computers since the days of Mac OS 9, and + continues today in Mac OS X and iOS. There are also implementations + for Microsoft Windows [B4W], Linux, and other platforms. + + Some network operators setting up private internal networks + ("intranets") have used unregistered top-level domains, and some may + have used the ".local" top-level domain. Using ".local" as a private + top-level domain conflicts with Multicast DNS and may cause problems + for users. Clients can be configured to send both Multicast and + Unicast DNS queries in parallel for these names, and this does allow + names to be looked up both ways, but this results in additional + network traffic and additional delays in name resolution, as well as + potentially creating user confusion when it is not clear whether any + given result was received via link-local multicast from a peer on the + same link, or from the configured unicast name server. Because of + this, we recommend against using ".local" as a private Unicast DNS + top-level domain. We do not recommend use of unregistered top-level + domains at all, but should network operators decide to do this, the + following top-level domains have been used on private internal + networks without the problems caused by trying to reuse ".local." for + this purpose: + + .intranet. + .internal. + .private. + .corp. + .home. + .lan. + +Appendix H. 
Deployment History + + In July 1997, in an email to the net-thinkers@thumper.vmeng.com + mailing list, Stuart Cheshire first proposed the idea of running the + AppleTalk Name Binding Protocol [RFC6760] over IP. As a result of + this and related IETF discussions, the IETF Zeroconf working group + was chartered September 1999. After various working group + discussions and other informal IETF discussions, several Internet- + Drafts were written that were loosely related to the general themes + of DNS and multicast, but did not address the service discovery + aspect of NBP. + + + + +Cheshire & Krochmal Standards Track [Page 67] + +RFC 6762 Multicast DNS February 2013 + + + In April 2000, Stuart Cheshire registered IPv4 multicast address + 224.0.0.251 with IANA [MC4] and began writing code to test and + develop the idea of performing NBP-like service discovery using + Multicast DNS, which was documented in a group of three Internet- + Drafts: + + o "Requirements for a Protocol to Replace the AppleTalk Name Binding + Protocol (NBP)" [RFC6760] is an overview explaining the AppleTalk + Name Binding Protocol, because many in the IETF community had + little first-hand experience using AppleTalk, and confusion in the + IETF community about what AppleTalk NBP did was causing confusion + about what would be required in an IP-based replacement. + + o "Discovering Named Instances of Abstract Services using DNS" [NIAS] + proposed a way to perform NBP-like service discovery using DNS- + compatible names and record types. + + o "Multicast DNS" (this document) specifies a way to transport those + DNS-compatible queries and responses using IP multicast, for zero- + configuration environments where no conventional Unicast DNS server + was available. + + In 2001, an update to Mac OS 9 added resolver library support for + host name lookup using Multicast DNS. If the user typed a name such + as "MyPrinter.local." 
into any piece of networking software that used + the standard Mac OS 9 name lookup APIs, then those name lookup APIs + would recognize the name as a dot-local name and query for it by + sending simple one-shot Multicast DNS queries to 224.0.0.251:5353. + This enabled the user to, for example, enter the name + "MyPrinter.local." into their web browser in order to view a + printer's status and configuration web page, or enter the name + "MyPrinter.local." into the printer setup utility to create a print + queue for printing documents on that printer. + + Multicast DNS responder software, with full service discovery, first + began shipping to end users in volume with the launch of Mac OS X + 10.2 "Jaguar" in August 2002, and network printer makers (who had + historically supported AppleTalk in their network printers and were + receptive to IP-based technologies that could offer them similar + ease-of-use) started adopting Multicast DNS shortly thereafter. + + In September 2002, Apple released the source code for the + mDNSResponder daemon as Open Source under Apple's standard Apple + Public Source License (APSL). + + Multicast DNS responder software became available for Microsoft + Windows users in June 2004 with the launch of Apple's "Rendezvous for + Windows" (now "Bonjour for Windows"), both in executable form (a + + + +Cheshire & Krochmal Standards Track [Page 68] + +RFC 6762 Multicast DNS February 2013 + + + downloadable installer for end users) and as Open Source (one of the + supported platforms within Apple's body of cross-platform code in the + publicly accessible mDNSResponder CVS source code repository) [BJ]. + + In August 2006, Apple re-licensed the cross-platform mDNSResponder + source code under the Apache License, Version 2.0. 
+ + In addition to desktop and laptop computers running Mac OS X and + Microsoft Windows, Multicast DNS is now implemented in a wide range + of hardware devices, such as Apple's "AirPort" wireless base + stations, iPhone and iPad, and in home gateways from other vendors, + network printers, network cameras, TiVo DVRs, etc. + + The Open Source community has produced many independent + implementations of Multicast DNS, some in C like Apple's + mDNSResponder daemon, and others in a variety of different languages + including Java, Python, Perl, and C#/Mono. + + In January 2007, the IETF published the Informational RFC "Link-Local + Multicast Name Resolution (LLMNR)" [RFC4795], which is substantially + similar to Multicast DNS, but incompatible in some small but + important ways. In particular, the LLMNR design explicitly excluded + support for service discovery, which made it an unsuitable candidate + for a protocol to replace AppleTalk NBP [RFC6760]. + + While the original focus of Multicast DNS and DNS-Based Service + Discovery was for zero-configuration environments without a + conventional Unicast DNS server, DNS-Based Service Discovery also + works using Unicast DNS servers, using DNS Update [RFC2136] [RFC3007] + to create service discovery records and standard DNS queries to query + for them. Apple's Back to My Mac service, launched with Mac OS X + 10.5 "Leopard" in October 2007, uses DNS-Based Service Discovery over + Unicast DNS [RFC6281]. + + In June 2012, Google's Android operating system added native support + for DNS-SD and Multicast DNS with the android.net.nsd.NsdManager + class in Android 4.1 "Jelly Bean" (API Level 16) [NSD]. + + + + + + + + + + + + + + +Cheshire & Krochmal Standards Track [Page 69] + +RFC 6762 Multicast DNS February 2013 + + +Authors' Addresses + + Stuart Cheshire + Apple Inc. + 1 Infinite Loop + Cupertino, CA 95014 + USA + + Phone: +1 408 974 3207 + EMail: cheshire@apple.com + + + Marc Krochmal + Apple Inc. 
+ 1 Infinite Loop + Cupertino, CA 95014 + USA + + Phone: +1 408 974 4368 + EMail: marc@apple.com + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Cheshire & Krochmal Standards Track [Page 70] + diff --git a/include/ZT1Service.h b/include/ZT1Service.h index 5cd0694..a55c088 100644 --- a/include/ZT1Service.h +++ b/include/ZT1Service.h @@ -103,7 +103,7 @@ void disableTaps(); * @param addrlen * @return */ -void zts_get_ipv4_address(const char *nwid, char *addrstr, const int addrlen); +void zts_get_ipv4_address(const char *nwid, char *addrstr, const size_t addrlen); /** * @brief Gets the VirtualTap's (interface) IPv6 address @@ -114,7 +114,7 @@ void zts_get_ipv4_address(const char *nwid, char *addrstr, const int addrlen); * @param addrlen * @return */ -void zts_get_ipv6_address(const char *nwid, char *addrstr, const int addrlen); +void zts_get_ipv6_address(const char *nwid, char *addrstr, const size_t addrlen); /** * @brief Returns whether the VirtualTap has an assigned IPv4 address @@ -198,9 +198,9 @@ int zts_running(); * that one call this at the beginning of your application code since it may take several seconds to fully * come online. * @param path Where this instance of ZeroTier will store its identity and configuration files - * @return + * @return Returns 1 if ZeroTier is currently running, and 0 if it is not */ -void zts_start(const char *path); +int zts_start(const char *path); /** * @brief Alternative to zts_start(). Start an instance of libzt, wait for an address to be issues, and join @@ -211,15 +211,15 @@ void zts_start(const char *path); * come online. 
* @param path * @param nwid A 16-digit hexidecimal virtual network ID - * @return + * @return Returns 0 on success, -1 on failure */ -void zts_simple_start(const char *path, const char *nwid); +int zts_simple_start(const char *path, const char *nwid); /** * @brief Stops libzt (ZeroTier core services, stack drivers, stack threads, etc) * * @usage This should be called at the end of your program or when you do not anticipate communicating over ZeroTier - * @return + * @return Returns 0 on success, -1 on failure */ void zts_stop(); @@ -231,7 +231,7 @@ void zts_stop(); * @param len * @return */ -void zts_get_homepath(char *homePath, int len); +void zts_get_homepath(char *homePath, size_t len); /** * @brief Copies the hexidecimal representation of this nodeID into the provided buffer diff --git a/include/libzt.h b/include/libzt.h index 5fa678b..4cd5471 100644 --- a/include/libzt.h +++ b/include/libzt.h @@ -53,7 +53,6 @@ extern "C" { #endif // forward declarations from ZT1Service.h -void zts_simple_start(const char *path, const char *nwid); int zts_get_device_id(char *devID); void init_network_stack(); @@ -71,7 +70,7 @@ void init_network_stack(); * @param nwid A 16-digit hexidecimal network identifier (e.g. Earth: `8056c2e21c000001`) * @return 0 if successful; or 1 if failed */ -void zts_start(const char *path); +int zts_start(const char *path); /** * @brief Starts libzt @@ -86,7 +85,7 @@ void zts_start(const char *path); * @param nwid A 16-digit hexidecimal network identifier (e.g. 
Earth: `8056c2e21c000001`) * @return 0 if successful; or 1 if failed */ -void zts_simple_start(const char *path, const char *nwid); +int zts_simple_start(const char *path, const char *nwid); /** * @brief Stops the ZeroTier core service and disconnects from all virtual networks @@ -143,7 +142,7 @@ void zts_leave_soft(const char * filepath, const char * nwid); * @param len * @return */ -void zts_get_homepath(char *homePath, const int len); +void zts_get_homepath(char *homePath, const size_t len); /** * @brief Get device ID (10-digit hex + NULL byte) @@ -198,7 +197,7 @@ int zts_has_address(const char *nwid); * @param addrlen * @return */ -void zts_get_ipv4_address(const char *nwid, char *addrstr, const int addrlen); +void zts_get_ipv4_address(const char *nwid, char *addrstr, const size_t addrlen); /** * @brief Get IPV6 Address for this device on a given network @@ -209,7 +208,7 @@ void zts_get_ipv4_address(const char *nwid, char *addrstr, const int addrlen); * @param addrlen * @return */ -void zts_get_ipv6_address(const char *nwid, char *addrstr, const int addrlen); +void zts_get_ipv6_address(const char *nwid, char *addrstr, const size_t addrlen); /** * @brief Returns a 6PLANE IPv6 address given a network ID and zerotier ID diff --git a/src/VirtualTap.hpp b/src/VirtualTap.hpp index 6ad6a7c..dd61e5c 100644 --- a/src/VirtualTap.hpp +++ b/src/VirtualTap.hpp @@ -206,7 +206,7 @@ namespace ZeroTier { char vtap_abbr_name[16]; static int devno; - int ifindex = 0; + size_t ifindex = 0; std::vector ips() const; std::vector _ips; diff --git a/src/ZT1Service.cpp b/src/ZT1Service.cpp index 085ba4a..b8dd44c 100644 --- a/src/ZT1Service.cpp +++ b/src/ZT1Service.cpp @@ -258,7 +258,7 @@ void disableTaps() ZeroTier::_vtaps_lock.unlock(); } -void zts_get_ipv4_address(const char *nwid, char *addrstr, const int addrlen) +void zts_get_ipv4_address(const char *nwid, char *addrstr, const size_t addrlen) { if (ZeroTier::zt1Service) { uint64_t nwid_int = strtoull(nwid, NULL, 16); @@ -280,7 
+280,7 @@ void zts_get_ipv4_address(const char *nwid, char *addrstr, const int addrlen) memcpy(addrstr, "\0", 1); } -void zts_get_ipv6_address(const char *nwid, char *addrstr, const int addrlen) +void zts_get_ipv6_address(const char *nwid, char *addrstr, size_t addrlen) { if (ZeroTier::zt1Service) { uint64_t nwid_int = strtoull(nwid, NULL, 16); @@ -391,10 +391,10 @@ int zts_running() { return ZeroTier::zt1Service == NULL ? false : ZeroTier::zt1Service->isRunning(); } -void zts_start(const char *path) +int zts_start(const char *path) { if (ZeroTier::zt1Service) { - return; + return 0; // already initialized, ok } if (path) { ZeroTier::homeDir = path; @@ -403,12 +403,12 @@ void zts_start(const char *path) WSAStartup(MAKEWORD(2, 2), &wsaData); // initialize WinSock. Used in Phy for loopback pipe #endif pthread_t service_thread; - pthread_create(&service_thread, NULL, zts_start_service, NULL); + return pthread_create(&service_thread, NULL, zts_start_service, NULL); } -void zts_simple_start(const char *path, const char *nwid) +int zts_simple_start(const char *path, const char *nwid) { - zts_start(path); + int err = zts_start(path); while (zts_running() == false) { DEBUG_EXTRA("waiting for service to start"); nanosleep((const struct timespec[]) {{0, (ZTO_WRAPPER_CHECK_INTERVAL * 1000000)}}, NULL); @@ -426,6 +426,7 @@ void zts_simple_start(const char *path, const char *nwid) while (zts_has_address(nwid) == false) { nanosleep((const struct timespec[]) {{0, (ZTO_WRAPPER_CHECK_INTERVAL * 1000000)}}, NULL); } + return err; } void zts_stop() { @@ -438,10 +439,10 @@ void zts_stop() { #endif } -void zts_get_homepath(char *homePath, int len) { +void zts_get_homepath(char *homePath, size_t len) { if (ZeroTier::homeDir.length()) { memset(homePath, 0, len); - int buf_len = len < ZeroTier::homeDir.length() ? len : ZeroTier::homeDir.length(); + size_t buf_len = len < ZeroTier::homeDir.length() ? len : ZeroTier::homeDir.length(); memcpy(homePath, ZeroTier::homeDir.c_str(), buf_len); } }