diff --git a/Makefile b/Makefile
index b962c2e..57a3b6b 100644
--- a/Makefile
+++ b/Makefile
@@ -1,11 +1,11 @@
-source = parse.cpp parserstate.cpp parser.cpp templates.cpp terminal.cpp termemu.cpp parseraction.cpp terminalfunctions.cpp swrite.cpp terminalframebuffer.cpp terminaldispatcher.cpp terminaluserinput.cpp terminaldisplay.cpp network.cpp ntester.cpp
-objects = parserstate.o parser.o templates.o terminal.o parseraction.o terminalfunctions.o swrite.o terminalframebuffer.o terminaldispatcher.o terminaluserinput.o terminaldisplay.o network.o
+source = parse.cpp parserstate.cpp parser.cpp templates.cpp terminal.cpp termemu.cpp parseraction.cpp terminalfunctions.cpp swrite.cpp terminalframebuffer.cpp terminaldispatcher.cpp terminaluserinput.cpp terminaldisplay.cpp network.cpp ntester.cpp ocb.cpp base64.cpp encrypt.cpp decrypt.cpp crypto.cpp
+objects = parserstate.o parser.o templates.o terminal.o parseraction.o terminalfunctions.o swrite.o terminalframebuffer.o terminaldispatcher.o terminaluserinput.o terminaldisplay.o network.o ocb.o base64.o crypto.o
 repos = templates.rpo
-executables = parse termemu ntester
+executables = parse termemu ntester encrypt decrypt
 
 CXX = g++
 CXXFLAGS = -g --std=c++0x -pedantic -Werror -Wall -Wextra -Weffc++ -fno-implicit-templates -fno-default-inline -pipe -D_FILE_OFFSET_BITS=64 -D_XOPEN_SOURCE=500 -D_GNU_SOURCE
-LIBS = -lutil
+LIBS = -lutil -lssl -lcrypto # libcrypto named explicitly: OpenSSL's cipher symbols live there, not in libssl
 
 all: $(executables)
 
@@ -18,6 +18,12 @@ termemu: termemu.o $(objects) parse # serialize link steps because of -frepo
 ntester: ntester.o $(objects) termemu # serialize link steps because of -frepo
 	$(CXX) $(CXXFLAGS) -o $@ ntester.o $(objects) $(LIBS)
 
+encrypt: encrypt.o $(objects) ntester # ntester is listed only to serialize link steps (-frepo)
+	$(CXX) $(CXXFLAGS) -o $@ $< $(objects) $(LIBS)
+
+decrypt: decrypt.o $(objects) encrypt # encrypt is listed only to serialize link steps (-frepo)
+	$(CXX) $(CXXFLAGS) -o $@ $< $(objects) $(LIBS)
+
 templates.o: templates.cpp
 	$(CXX) $(CXXFLAGS) -frepo -c -o $@ $<
 
diff --git a/ae.hpp b/ae.hpp
new file mode 100644
index 0000000..fb5c511
--- /dev/null
+++ b/ae.hpp
@@ -0,0 +1,182 @@
+/* ---------------------------------------------------------------------------
+ *
+ * AEAD API 0.12 - 13 July 2011
+ *
+ * This file gives an interface appropriate for many authenticated
+ * encryption with associated data (AEAD) implementations. It does not try
+ * to accommodate all possible options or limitations that an implementation
+ * might have -- you should consult the documentation of your chosen
+ * implementation to find things like RFC 5116 constants, alignment
+ * requirements, whether the incremental interface is supported, etc.
+ *
+ * This file is in the public domain. It is provided "as is", without
+ * warranty of any kind. Use at your own risk.
+ *
+ * Comments are welcome: Ted Krovetz <ted@krovetz>.
+ *
+ * ------------------------------------------------------------------------ */
+
+#ifndef _AE_H_
+#define _AE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* --------------------------------------------------------------------------
+ *
+ * Constants
+ *
+ * ----------------------------------------------------------------------- */
+
+/* Return status codes: Negative return values indicate an error occurred.
+ * For full explanations of error values, consult the implementation's
+ * documentation.                                                          */
+#define AE_SUCCESS       ( 0)  /* Indicates successful completion of call  */
+#define AE_INVALID       (-1)  /* Indicates bad tag during decryption      */
+#define AE_NOT_SUPPORTED (-2)  /* Indicates unsupported option requested   */
+
+/* Flags: When data can be processed "incrementally", these flags are used
+ * to indicate whether the submitted data is the last or not.               */
+#define AE_FINALIZE      (1)   /* This is the last of data                  */
+#define AE_PENDING       (0)   /* More data is coming                       */
+
+/* --------------------------------------------------------------------------
+ *
+ * AEAD opaque structure definition
+ *
+ * ----------------------------------------------------------------------- */
+
+typedef struct _ae_ctx ae_ctx;
+
+/* --------------------------------------------------------------------------
+ *
+ * Data Structure Routines
+ *
+ * ----------------------------------------------------------------------- */
+
+ae_ctx* ae_allocate  (void *misc);  /* Allocate ae_ctx, set optional ptr   */
+void    ae_free      (ae_ctx *ctx); /* Deallocate ae_ctx struct            */
+int     ae_clear     (ae_ctx *ctx); /* Undo initialization                 */
+int     ae_ctx_sizeof(void);        /* Return sizeof(ae_ctx)               */
+/* ae_allocate() allocates an ae_ctx structure, but does not initialize it.
+ * ae_free() deallocates an ae_ctx structure, but does not zeroize it.
+ * ae_clear() zeroes sensitive values associated with an ae_ctx structure
+ * and deallocates any auxiliary structures allocated during ae_init().
+ * ae_ctx_sizeof() returns sizeof(ae_ctx), to aid in any static allocations.
+ */
+
+/* --------------------------------------------------------------------------
+ *
+ * AEAD Routines
+ *
+ * ----------------------------------------------------------------------- */
+
+int ae_init(ae_ctx     *ctx,
+            const void *key,
+            int         key_len,
+            int         nonce_len,
+            int         tag_len);
+/* --------------------------------------------------------------------------
+ *
+ * Initialize an ae_ctx context structure.
+ *
+ * Parameters:
+ *  ctx       - Pointer to an ae_ctx structure to be initialized
+ *  key       - Pointer to user-supplied key
+ *  key_len   - Length of key supplied, in bytes
+ *  nonce_len - Length of nonces to be used for this key, in bytes
+ *  tag_len   - Length of tags to be produced for this key, in bytes
+ *
+ * Returns:
+ *  AE_SUCCESS       - Success. Ctx ready for use.
+ *  AE_NOT_SUPPORTED - An unsupported length was supplied. Ctx is untouched.
+ *  Otherwise        - Error. Check implementation documentation for codes.
+ *
+ * ----------------------------------------------------------------------- */
+
+int ae_encrypt(ae_ctx     *ctx,
+               const void *nonce,
+               const void *pt,
+               int         pt_len,
+               const void *ad,
+               int         ad_len,
+               void       *ct,
+               void       *tag,
+               int         final);
+/* --------------------------------------------------------------------------
+ *
+ * Encrypt plaintext; provide for authentication of ciphertext/associated data.
+ *
+ * Parameters:
+ *  ctx    - Pointer to an ae_ctx structure initialized by ae_init.
+ *  nonce  - Pointer to a nonce_len (defined in ae_init) byte nonce.
+ *  pt     - Pointer to plaintext bytes to be encrypted.
+ *  pt_len - number of bytes pointed to by pt.
+ *  ad     - Pointer to associated data.
+ *  ad_len - number of bytes pointed to by ad.
+ *  ct     - Pointer to buffer to receive ciphertext encryption.
+ *  tag    - Pointer to receive authentication tag; or NULL
+ *           if tag is to be bundled into the ciphertext.
+ *  final  - Non-zero if this call completes the plaintext being encrypted.
+ *
+ * If nonce!=NULL then a message is being initiated. If final!=0
+ * then a message is being finalized. If final==0 or nonce==NULL
+ * then the incremental interface is being used. If nonce!=NULL and
+ * ad_len<0, then use same ad as last message.
+ *
+ * Returns:
+ *  non-negative     - Number of bytes written to ct.
+ *  AE_NOT_SUPPORTED - Usage mode unsupported (eg, incremental and/or sticky).
+ *  Otherwise        - Error. Check implementation documentation for codes.
+ *
+ * ----------------------------------------------------------------------- */
+
+int ae_decrypt(ae_ctx     *ctx,
+               const void *nonce,
+               const void *ct,
+               int         ct_len,
+               const void *ad,
+               int         ad_len,
+               void       *pt,
+               const void *tag,
+               int         final);
+/* --------------------------------------------------------------------------
+ *
+ * Decrypt ciphertext; provide authenticity of plaintext and associated data.
+ *
+ * Parameters:
+ *  ctx    - Pointer to an ae_ctx structure initialized by ae_init.
+ *  nonce  - Pointer to a nonce_len (defined in ae_init) byte nonce.
+ *  ct     - Pointer to ciphertext bytes to be decrypted.
+ *  ct_len - number of bytes pointed to by ct.
+ *  ad     - Pointer to associated data.
+ *  ad_len - number of bytes pointed to by ad.
+ *  pt     - Pointer to buffer to receive plaintext decryption.
+ *  tag    - Pointer to tag_len (defined in ae_init) bytes; or NULL
+ *           if tag is bundled into the ciphertext. Non-NULL tag is only
+ *           read when final is non-zero.
+ *  final  - Non-zero if this call completes the ciphertext being decrypted.
+ *
+ * If nonce!=NULL then "ct" points to the start of a ciphertext. If final!=0
+ * then "ct" points to the final piece of ciphertext. If final==0 or nonce==
+ * NULL then the incremental interface is being used. If nonce!=NULL and
+ * ad_len<0, then use same ad as last message.
+ *
+ * Returns:
+ *  non-negative     - Number of bytes written to pt.
+ *  AE_INVALID       - Authentication failure.
+ *  AE_NOT_SUPPORTED - Usage mode unsupported (eg, incremental and/or sticky).
+ *  Otherwise        - Error. Check implementation documentation for codes.
+ *
+ * NOTE !!! NOTE !!! -- The ciphertext should be assumed possibly inauthentic
+ *                      until it has been completely written and it is
+ *                      verified that this routine did not return AE_INVALID.
+ *
+ * ----------------------------------------------------------------------- */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* _AE_H_ */
diff --git a/base64.cpp b/base64.cpp
new file mode 100644
index 0000000..028e2f4
--- /dev/null
+++ b/base64.cpp
@@ -0,0 +1,577 @@
+/* Taken from GNU coreutils */
+
+/* base64.c -- Encode binary data using printable characters.
+   Copyright (C) 1999-2001, 2004-2006, 2009-2011 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
+
+/* Written by Simon Josefsson.  Partially adapted from GNU MailUtils
+ * (mailbox/filter_trans.c, as of 2004-11-28).  Improved by review
+ * from Paul Eggert, Bruno Haible, and Stepan Kasal.
+ *
+ * See also RFC 3548 <http://www.ietf.org/rfc/rfc3548.txt>.
+ *
+ * Be careful with error checking.  Here is how you would typically
+ * use these functions:
+ *
+ * bool ok = base64_decode_alloc (in, inlen, &out, &outlen);
+ * if (!ok)
+ *   FAIL: input was not valid base64
+ * if (out == NULL)
+ *   FAIL: memory allocation error
+ * OK: data in OUT/OUTLEN
+ *
+ * size_t outlen = base64_encode_alloc (in, inlen, &out);
+ * if (out == NULL && outlen == 0 && inlen != 0)
+ *   FAIL: input too long
+ * if (out == NULL)
+ *   FAIL: memory allocation error
+ * OK: data in OUT/OUTLEN.
+ *
+ */
+
+// #include <config.h>
+
+/* Get prototype. */
+#include "base64.h"
+
+/* Get malloc. */
+#include <stdlib.h>
+
+/* Get UCHAR_MAX. */
+#include <limits.h>
+
+#include <string.h>
+
+/* Convert a plain 'char' to 'unsigned char' (used before table lookups). */
+static inline unsigned char
+to_uchar (char ch)
+{
+  return (unsigned char) ch;
+}
+
+/* Base64 encode IN array of size INLEN into OUT array of size OUTLEN,
+   three input bytes to four output characters, padding with '='.  If
+   OUTLEN is less than BASE64_LENGTH(INLEN), write as many bytes as possible;
+   if it is larger, also zero terminate the output buffer. */
+void
+base64_encode (const char *restrict in, size_t inlen,
+               char *restrict out, size_t outlen)
+{
+  static const char b64str[65] = /* KJW */
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+  while (inlen && outlen)
+    {
+      *out++ = b64str[(to_uchar (in[0]) >> 2) & 0x3f]; /* top 6 bits of in[0] */
+      if (!--outlen)
+        break;
+      *out++ = b64str[((to_uchar (in[0]) << 4)
+                       + (--inlen ? to_uchar (in[1]) >> 4 : 0)) /* --inlen: in[0] is consumed */
+                      & 0x3f];
+      if (!--outlen)
+        break;
+      *out++ =
+        (inlen /* non-zero: a second input byte exists */
+         ? b64str[((to_uchar (in[1]) << 2)
+                   + (--inlen ? to_uchar (in[2]) >> 6 : 0)) /* --inlen: in[1] is consumed */
+                  & 0x3f]
+         : '='); /* only one input byte was left: pad */
+      if (!--outlen)
+        break;
+      *out++ = inlen ? b64str[to_uchar (in[2]) & 0x3f] : '='; /* low 6 bits of in[2], or pad */
+      if (!--outlen)
+        break;
+      if (inlen)
+        inlen--; /* in[2] is consumed */
+      if (inlen)
+        in += 3; /* advance only when more input remains */
+    }
+
+  if (outlen)
+    *out = '\0'; /* terminate only if room is left */
+}
+
+/* Allocate a buffer and store zero terminated base64 encoded data
+   from array IN of size INLEN, returning BASE64_LENGTH(INLEN), i.e.,
+   the length of the encoded data, excluding the terminating zero.  On
+   return, the OUT variable will hold a pointer to newly allocated
+   memory that must be deallocated by the caller.  If output string
+   length would overflow, 0 is returned and OUT is set to NULL.  If
+   memory allocation failed, OUT is set to NULL, and the return value
+   indicates length of the requested memory block, i.e.,
+   BASE64_LENGTH(inlen) + 1. */
+size_t
+base64_encode_alloc (const char *in, size_t inlen, char **out)
+{
+  size_t outlen = 1 + BASE64_LENGTH (inlen); /* + 1 for the terminating zero */
+
+  /* Check for overflow in outlen computation.
+   *
+   * If there is no overflow, outlen >= inlen.
+   *
+   * If the operation (inlen + 2) overflows then it yields at most +1, so
+   * BASE64_LENGTH (inlen) is 0 and outlen is 1.
+   *
+   * If the multiplication overflows, we lose at least half of the
+   * correct value, so the result is < ((inlen + 2) / 3) * 2, which is
+   * less than (inlen + 2) * 0.66667, which is less than inlen as soon as
+   * (inlen > 4).
+   */
+  if (inlen > outlen)
+    {
+      *out = NULL;
+      return 0;
+    }
+
+  *out = (char *) malloc (outlen); /* KJW */
+  if (!*out)
+    return outlen; /* allocation failed: report the size that was requested */
+
+  base64_encode (in, inlen, *out, outlen);
+
+  return outlen - 1; /* length without the terminating zero */
+}
+
+/* With this approach this file works independent of the charset used
+   (think EBCDIC).  However, it does assume that the characters in the
+   Base64 alphabet (A-Za-z0-9+/) are encoded in 0..255.  POSIX
+   1003.1-2001 require that char and unsigned char are 8-bit
+   quantities, though, taking care of that problem.  But this may be a
+   potential problem on non-POSIX C99 platforms.
+
+   IBM C V6 for AIX mishandles "#define B64(x) ...'x'...", so use "_"
+   as the formal parameter rather than "x".  */
+#define B64(_)                                  \
+  ((_) == 'A' ? 0                               \
+   : (_) == 'B' ? 1                             \
+   : (_) == 'C' ? 2                             \
+   : (_) == 'D' ? 3                             \
+   : (_) == 'E' ? 4                             \
+   : (_) == 'F' ? 5                             \
+   : (_) == 'G' ? 6                             \
+   : (_) == 'H' ? 7                             \
+   : (_) == 'I' ? 8                             \
+   : (_) == 'J' ? 9                             \
+   : (_) == 'K' ? 10                            \
+   : (_) == 'L' ? 11                            \
+   : (_) == 'M' ? 12                            \
+   : (_) == 'N' ? 13                            \
+   : (_) == 'O' ? 14                            \
+   : (_) == 'P' ? 15                            \
+   : (_) == 'Q' ? 16                            \
+   : (_) == 'R' ? 17                            \
+   : (_) == 'S' ? 18                            \
+   : (_) == 'T' ? 19                            \
+   : (_) == 'U' ? 20                            \
+   : (_) == 'V' ? 21                            \
+   : (_) == 'W' ? 22                            \
+   : (_) == 'X' ? 23                            \
+   : (_) == 'Y' ? 24                            \
+   : (_) == 'Z' ? 25                            \
+   : (_) == 'a' ? 26                            \
+   : (_) == 'b' ? 27                            \
+   : (_) == 'c' ? 28                            \
+   : (_) == 'd' ? 29                            \
+   : (_) == 'e' ? 30                            \
+   : (_) == 'f' ? 31                            \
+   : (_) == 'g' ? 32                            \
+   : (_) == 'h' ? 33                            \
+   : (_) == 'i' ? 34                            \
+   : (_) == 'j' ? 35                            \
+   : (_) == 'k' ? 36                            \
+   : (_) == 'l' ? 37                            \
+   : (_) == 'm' ? 38                            \
+   : (_) == 'n' ? 39                            \
+   : (_) == 'o' ? 40                            \
+   : (_) == 'p' ? 41                            \
+   : (_) == 'q' ? 42                            \
+   : (_) == 'r' ? 43                            \
+   : (_) == 's' ? 44                            \
+   : (_) == 't' ? 45                            \
+   : (_) == 'u' ? 46                            \
+   : (_) == 'v' ? 47                            \
+   : (_) == 'w' ? 48                            \
+   : (_) == 'x' ? 49                            \
+   : (_) == 'y' ? 50                            \
+   : (_) == 'z' ? 51                            \
+   : (_) == '0' ? 52                            \
+   : (_) == '1' ? 53                            \
+   : (_) == '2' ? 54                            \
+   : (_) == '3' ? 55                            \
+   : (_) == '4' ? 56                            \
+   : (_) == '5' ? 57                            \
+   : (_) == '6' ? 58                            \
+   : (_) == '7' ? 59                            \
+   : (_) == '8' ? 60                            \
+   : (_) == '9' ? 61                            \
+   : (_) == '+' ? 62                            \
+   : (_) == '/' ? 63                            \
+   : -1)
+
+static const signed char b64[0x100] = {
+  B64 (0), B64 (1), B64 (2), B64 (3),
+  B64 (4), B64 (5), B64 (6), B64 (7),
+  B64 (8), B64 (9), B64 (10), B64 (11),
+  B64 (12), B64 (13), B64 (14), B64 (15),
+  B64 (16), B64 (17), B64 (18), B64 (19),
+  B64 (20), B64 (21), B64 (22), B64 (23),
+  B64 (24), B64 (25), B64 (26), B64 (27),
+  B64 (28), B64 (29), B64 (30), B64 (31),
+  B64 (32), B64 (33), B64 (34), B64 (35),
+  B64 (36), B64 (37), B64 (38), B64 (39),
+  B64 (40), B64 (41), B64 (42), B64 (43),
+  B64 (44), B64 (45), B64 (46), B64 (47),
+  B64 (48), B64 (49), B64 (50), B64 (51),
+  B64 (52), B64 (53), B64 (54), B64 (55),
+  B64 (56), B64 (57), B64 (58), B64 (59),
+  B64 (60), B64 (61), B64 (62), B64 (63),
+  B64 (64), B64 (65), B64 (66), B64 (67),
+  B64 (68), B64 (69), B64 (70), B64 (71),
+  B64 (72), B64 (73), B64 (74), B64 (75),
+  B64 (76), B64 (77), B64 (78), B64 (79),
+  B64 (80), B64 (81), B64 (82), B64 (83),
+  B64 (84), B64 (85), B64 (86), B64 (87),
+  B64 (88), B64 (89), B64 (90), B64 (91),
+  B64 (92), B64 (93), B64 (94), B64 (95),
+  B64 (96), B64 (97), B64 (98), B64 (99),
+  B64 (100), B64 (101), B64 (102), B64 (103),
+  B64 (104), B64 (105), B64 (106), B64 (107),
+  B64 (108), B64 (109), B64 (110), B64 (111),
+  B64 (112), B64 (113), B64 (114), B64 (115),
+  B64 (116), B64 (117), B64 (118), B64 (119),
+  B64 (120), B64 (121), B64 (122), B64 (123),
+  B64 (124), B64 (125), B64 (126), B64 (127),
+  B64 (128), B64 (129), B64 (130), B64 (131),
+  B64 (132), B64 (133), B64 (134), B64 (135),
+  B64 (136), B64 (137), B64 (138), B64 (139),
+  B64 (140), B64 (141), B64 (142), B64 (143),
+  B64 (144), B64 (145), B64 (146), B64 (147),
+  B64 (148), B64 (149), B64 (150), B64 (151),
+  B64 (152), B64 (153), B64 (154), B64 (155),
+  B64 (156), B64 (157), B64 (158), B64 (159),
+  B64 (160), B64 (161), B64 (162), B64 (163),
+  B64 (164), B64 (165), B64 (166), B64 (167),
+  B64 (168), B64 (169), B64 (170), B64 (171),
+  B64 (172), B64 (173), B64 (174), B64 (175),
+  B64 (176), B64 (177), B64 (178), B64 (179),
+  B64 (180), B64 (181), B64 (182), B64 (183),
+  B64 (184), B64 (185), B64 (186), B64 (187),
+  B64 (188), B64 (189), B64 (190), B64 (191),
+  B64 (192), B64 (193), B64 (194), B64 (195),
+  B64 (196), B64 (197), B64 (198), B64 (199),
+  B64 (200), B64 (201), B64 (202), B64 (203),
+  B64 (204), B64 (205), B64 (206), B64 (207),
+  B64 (208), B64 (209), B64 (210), B64 (211),
+  B64 (212), B64 (213), B64 (214), B64 (215),
+  B64 (216), B64 (217), B64 (218), B64 (219),
+  B64 (220), B64 (221), B64 (222), B64 (223),
+  B64 (224), B64 (225), B64 (226), B64 (227),
+  B64 (228), B64 (229), B64 (230), B64 (231),
+  B64 (232), B64 (233), B64 (234), B64 (235),
+  B64 (236), B64 (237), B64 (238), B64 (239),
+  B64 (240), B64 (241), B64 (242), B64 (243),
+  B64 (244), B64 (245), B64 (246), B64 (247),
+  B64 (248), B64 (249), B64 (250), B64 (251),
+  B64 (252), B64 (253), B64 (254), B64 (255)
+};
+
+#if UCHAR_MAX == 255
+# define uchar_in_range(c) true
+#else
+# define uchar_in_range(c) ((c) <= 255)
+#endif
+
+/* Return true if CH belongs to the Base64 alphabet, false otherwise.
+   The padding character '=' is not part of the alphabet.  */
+bool
+isbase64 (char ch)
+{
+  const unsigned char uc = to_uchar (ch);
+  return uchar_in_range (uc) && b64[uc] >= 0;
+}
+
+/* Initialize decode-context buffer, CTX.  */
+void
+base64_decode_ctx_init (struct base64_decode_context *ctx)
+{
+  ctx->i = 0; /* no bytes of a partial quadruple are buffered yet */
+}
+
+/* If CTX->i is 0 or 4, there are four or more bytes in [*IN..IN_END), and
+   none of those four is a newline, then return *IN.  Otherwise, copy up to
+   4 - CTX->i non-newline bytes from that range into CTX->buf, starting at
+   index CTX->i and setting CTX->i to reflect the number of bytes copied,
+   and return CTX->buf.  In either case, advance *IN to point to the byte
+   after the last one processed, and set *N_NON_NEWLINE to the number of
+   verified non-newline bytes accessible through the returned pointer.  */
+static inline char *
+get_4 (struct base64_decode_context *ctx,
+       char const *restrict *in, char const *restrict in_end,
+       size_t *n_non_newline)
+{
+  if (ctx->i == 4)
+    ctx->i = 0; /* a full buffer is treated as empty: start a new quadruple */
+
+  if (ctx->i == 0)
+    {
+      char const *t = *in;
+      if (4 <= in_end - *in && memchr (t, '\n', 4) == NULL)
+        {
+          /* This is the common case: no newline.  */
+          *in += 4;
+          *n_non_newline = 4;
+          return (char *) t; /* points into the caller's input; nothing is copied */
+        }
+    }
+
+  {
+    /* Copy non-newline bytes into BUF.  */
+    char const *p = *in;
+    while (p < in_end)
+      {
+        char c = *p++;
+        if (c != '\n')
+          {
+            ctx->buf[ctx->i++] = c;
+            if (ctx->i == 4)
+              break; /* buffer now holds a complete quadruple */
+          }
+      }
+
+    *in = p; /* everything up to P (newlines included) has been consumed */
+    *n_non_newline = ctx->i; /* may be fewer than 4 if the input ran out */
+    return ctx->buf;
+  }
+}
+
+#define return_false                            \
+  do                                            \
+    {                                           \
+      *outp = out;                              \
+      return false;                             \
+    }                                           \
+  while (false)
+
+/* Decode up to four bytes of base64-encoded data, IN, of length INLEN
+   into the output buffer, *OUTP, with room for *OUTLEFT bytes.  Return
+   true only if a complete, valid group of four was decoded.  If *OUTLEFT
+   is too small, as many bytes as possible are written.  *OUTP is advanced
+   past the last byte written and *OUTLEFT is decremented by the number of
+   bytes written, even when false is returned after a partial decode.  */
+static inline bool
+decode_4 (char const *restrict in, size_t inlen,
+          char *restrict *outp, size_t *outleft)
+{
+  char *out = *outp;
+  if (inlen < 2)
+    return false; /* one character cannot produce an output byte */
+
+  if (!isbase64 (in[0]) || !isbase64 (in[1]))
+    return false;
+
+  if (*outleft)
+    {
+      *out++ = ((b64[to_uchar (in[0])] << 2)
+                | (b64[to_uchar (in[1])] >> 4)); /* byte 1: in[0] + top 2 bits of in[1] */
+      --*outleft;
+    }
+
+  if (inlen == 2)
+    return_false; /* incomplete group: publish the byte written, report failure */
+
+  if (in[2] == '=')
+    {
+      /* "xx==" form: exactly four characters, both trailing ones padding.  */
+      if (inlen != 4)
+        return_false;
+
+      if (in[3] != '=')
+        return_false;
+    }
+  else
+    {
+      if (!isbase64 (in[2]))
+        return_false;
+
+      if (*outleft)
+        {
+          *out++ = (((b64[to_uchar (in[1])] << 4) & 0xf0)
+                    | (b64[to_uchar (in[2])] >> 2)); /* byte 2: low 4 of in[1] + top 4 of in[2] */
+          --*outleft;
+        }
+
+      if (inlen == 3)
+        return_false; /* incomplete group */
+
+      if (in[3] == '=')
+        {
+          if (inlen != 4) /* "xxx=" form: padding may only end the input */
+            return_false;
+        }
+      else
+        {
+          if (!isbase64 (in[3]))
+            return_false;
+
+          if (*outleft)
+            {
+              *out++ = (((b64[to_uchar (in[2])] << 6) & 0xc0)
+                        | b64[to_uchar (in[3])]); /* byte 3: low 2 of in[2] + in[3] */
+              --*outleft;
+            }
+        }
+    }
+
+  *outp = out;
+  return true;
+}
+
+/* Decode base64-encoded input array IN of length INLEN to output array
+   OUT that can hold *OUTLEN bytes.  The input data may be interspersed
+   with newlines.  Return true if decoding was successful, i.e. if the
+   input was valid base64 data, false otherwise.  If *OUTLEN is too
+   small, as many bytes as possible will be written to OUT.  On return,
+   *OUTLEN holds the length of decoded bytes in OUT.  Note that as soon
+   as any non-alphabet, non-newline character is encountered, decoding
+   is stopped and false is returned.  If INLEN is zero, then process
+   only whatever data is stored in CTX.
+
+   Initially, CTX must have been initialized via base64_decode_ctx_init.
+   Subsequent calls to this function must reuse whatever state is recorded
+   in that buffer.  It is necessary for when a quadruple of base64 input
+   bytes spans two input buffers.
+
+   If CTX is NULL then newlines are treated as garbage and the input
+   buffer is processed as a unit.  */
+
+bool
+base64_decode_ctx (struct base64_decode_context *ctx,
+                   const char *restrict in, size_t inlen,
+                   char *restrict out, size_t *outlen)
+{
+  size_t outleft = *outlen; /* room still available in OUT */
+  bool ignore_newlines = ctx != NULL;
+  bool flush_ctx = false;
+  unsigned int ctx_i = 0; /* snapshot of CTX->i on entry; not refreshed below */
+
+  if (ignore_newlines)
+    {
+      ctx_i = ctx->i;
+      flush_ctx = inlen == 0; /* empty input: only drain what CTX holds */
+    }
+
+
+  while (true)
+    {
+      size_t outleft_save = outleft;
+      if (ctx_i == 0 && !flush_ctx) /* fast path: CTX held nothing on entry */
+        {
+          while (true)
+            {
+              /* Save a copy of outleft, in case we need to re-parse this
+                 block of four bytes.  */
+              outleft_save = outleft;
+              if (!decode_4 (in, inlen, &out, &outleft))
+                break;
+
+              in += 4;
+              inlen -= 4;
+            }
+        }
+
+      if (inlen == 0 && !flush_ctx)
+        break; /* all input consumed */
+
+      /* Handle the common case of 72-byte wrapped lines.
+         This also handles any other multiple-of-4-byte wrapping.  */
+      if (inlen && *in == '\n' && ignore_newlines)
+        {
+          ++in;
+          --inlen;
+          continue;
+        }
+
+      /* Restore OUT and OUTLEFT.  */
+      out -= outleft_save - outleft; /* discard bytes the failed decode_4 wrote */
+      outleft = outleft_save;
+
+      {
+        char const *in_end = in + inlen;
+        char const *non_nl;
+
+        if (ignore_newlines)
+          non_nl = get_4 (ctx, &in, in_end, &inlen); /* inlen becomes the non-newline count */
+        else
+          non_nl = in;  /* Might have nl in this case. */
+
+        /* If the input is empty or consists solely of newlines (0 non-newlines),
+           then we're done.  Likewise if there are fewer than 4 bytes when not
+           flushing context and not treating newlines as garbage.  */
+        if (inlen == 0 || (inlen < 4 && !flush_ctx && ignore_newlines))
+          {
+            inlen = 0; /* a partial group stays buffered in CTX; not an error */
+            break;
+          }
+        if (!decode_4 (non_nl, inlen, &out, &outleft))
+          break; /* inlen is non-zero here, so false is returned below */
+
+        inlen = in_end - in; /* back to counting the raw input bytes remaining */
+      }
+    }
+
+  *outlen -= outleft; /* capacity minus unused room = bytes written */
+
+  return inlen == 0;
+}
+
+/* Allocate an output buffer in *OUT, and decode the base64 encoded
+   data stored in IN of size INLEN to the *OUT buffer.  On return, the
+   size of the decoded data is stored in *OUTLEN.  OUTLEN may be NULL,
+   if the caller is not interested in the decoded length.  *OUT may be
+   NULL to indicate an out of memory error, in which case *OUTLEN
+   contains the size of the memory block needed.  The function returns
+   true on successful decoding and memory allocation errors.  (Use the
+   *OUT and *OUTLEN parameters to differentiate between successful
+   decoding and memory error.)  The function returns false if the
+   input was invalid, in which case *OUT is NULL and *OUTLEN is
+   undefined. */
+bool
+base64_decode_alloc_ctx (struct base64_decode_context *ctx,
+                         const char *in, size_t inlen, char **out,
+                         size_t *outlen)
+{
+  /* Upper bound on the decoded size.  Up to three characters may already be
+     buffered in CTX, so IN can complete one quadruple more than INLEN / 4;
+     the "+ 1" also covers the bytes an incomplete trailing group emits.
+     Dividing before multiplying avoids the possibility of overflow.  */
+  size_t needlen = 3 * (inlen / 4 + 1);
+
+  *out = (char *) malloc (needlen);
+  if (!*out)
+    {
+      if (outlen)
+        *outlen = needlen; /* report the size we failed to get, as documented */
+      return true;
+    }
+  if (!base64_decode_ctx (ctx, in, inlen, *out, &needlen))
+    {
+      free (*out);
+      *out = NULL;
+      return false;
+    }
+  if (outlen)
+    *outlen = needlen; /* base64_decode_ctx replaced needlen with the decoded length */
+  return true;
+}
diff --git a/base64.h b/base64.h
new file mode 100644
index 0000000..8efa678
--- /dev/null
+++ b/base64.h
@@ -0,0 +1,65 @@
+/* Taken from GNU coreutils */
+
+#define restrict
+
+/* base64.h -- Encode binary data using printable characters.
+   Copyright (C) 2004-2006, 2009-2011 Free Software Foundation, Inc.
+   Written by Simon Josefsson.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
+
+#ifndef BASE64_H
+# define BASE64_H
+
+/* Get size_t. */
+# include <stddef.h>
+
+/* Get bool. */
+# include <stdbool.h>
+
+/* This uses that the expression (n+(k-1))/k means the smallest
+   integer >= n/k, i.e., the ceiling of n/k.  */
+# define BASE64_LENGTH(inlen) ((((inlen) + 2) / 3) * 4)
+
+struct base64_decode_context
+{
+  unsigned int i;
+  char buf[4];
+};
+
+extern bool isbase64 (char ch);
+
+extern void base64_encode (const char *restrict in, size_t inlen,
+                           char *restrict out, size_t outlen);
+
+extern size_t base64_encode_alloc (const char *in, size_t inlen, char **out);
+
+extern void base64_decode_ctx_init (struct base64_decode_context *ctx);
+
+extern bool base64_decode_ctx (struct base64_decode_context *ctx,
+                               const char *restrict in, size_t inlen,
+                               char *restrict out, size_t *outlen);
+
+extern bool base64_decode_alloc_ctx (struct base64_decode_context *ctx,
+                                     const char *in, size_t inlen,
+                                     char **out, size_t *outlen);
+
+#define base64_decode(in, inlen, out, outlen) \
+        base64_decode_ctx (NULL, in, inlen, out, outlen)
+
+#define base64_decode_alloc(in, inlen, out, outlen) \
+        base64_decode_alloc_ctx (NULL, in, inlen, out, outlen)
+
+#endif /* BASE64_H */
diff --git a/crypto.cpp b/crypto.cpp
new file mode 100644
index 0000000..6431328
--- /dev/null
+++ b/crypto.cpp
@@ -0,0 +1,211 @@
+#include <string.h>
+#include <stdio.h>
+
+#include "crypto.hpp"
+#include "base64.h"
+
+using namespace std;
+
+const char rdev[] = "/dev/urandom";
+
+static void * sse_alloc( int len )
+{
+  /* Return a 16-byte-aligned buffer (release with free()); throws std::bad_alloc.
+     posix_memalign() may yield NULL for a zero size, so request one byte then. */
+  void *ptr = NULL;
+  if( (0 != posix_memalign( &ptr, 16, (len == 0) ? 1 : len )) || (ptr == NULL) ) {
+    throw std::bad_alloc();
+  }
+  return ptr;
+}
+
+Base64Key::Base64Key( string printable_key )
+{
+  /* Parse a 22-character unpadded base64 string into the 16-octet key. */
+  if ( 22 != printable_key.length() ) {
+    throw CryptoException( "Key must be 22 letters long." );
+  }
+  /* restore the "==" padding that the printable form leaves out */
+  const string padded = printable_key + "==";
+  size_t decoded_len = 16;
+  const bool well_formed = base64_decode( padded.data(), 24, (char *)key, &decoded_len );
+  if ( !well_formed ) {
+    throw CryptoException( "Key must be well-formed base64." );
+  } else if ( 16 != decoded_len ) {
+    throw CryptoException( "Key must represent 16 octets." );
+  }
+
+  /* re-encode and compare, to catch changes after the first 128 bits */
+  const string reencoded = this->printable_key();
+  if ( reencoded != printable_key ) {
+    throw CryptoException( "Base64 key was not encoded 128-bit key." );
+  }
+}
+
+Base64Key::Base64Key()
+{
+  /* Generate a random key from 16 octets of rdev; throws CryptoException on any I/O error. */
+  FILE *devrandom = fopen( rdev, "r" );
+  if ( devrandom == NULL ) {
+    throw CryptoException( string( rdev ) + ": " + strerror( errno ) );
+  }
+  if ( 1 != fread( key, 16, 1, devrandom ) ) {
+    (void)fclose( devrandom ); /* do not leak the stream on a failed read */
+    throw CryptoException( "Could not read from " + string( rdev ) );
+  }
+  if ( 0 != fclose( devrandom ) ) {
+    throw CryptoException( string( rdev ) + ": " + strerror( errno ) );
+  }
+}
+
+string Base64Key::printable_key( void )
+{
+  /* Base64-encode the 16-octet key and return it without the "==" padding. */
+  char encoded[ 25 ];
+  base64_encode( (char *)key, 16, encoded, sizeof( encoded ) );
+
+  /* expect 22 characters, two '=' pad characters and a terminating NUL */
+  const bool terminated = (encoded[ 24 ] == 0);
+  const bool padded = (encoded[ 23 ] == '=') && (encoded[ 22 ] == '=');
+  if ( !(terminated && padded) ) {
+    throw CryptoException( "Unexpected output from base64_encode." );
+  }
+  encoded[ 22 ] = 0; /* cut the padding off */
+  return string( encoded );
+}
+
+Session::Session( Base64Key s_key )
+  : key( s_key ), ctx( ae_allocate( NULL ) )
+{
+  /* Allocate and initialize the AES-OCB context for this key; throws CryptoException. */
+  if ( ctx == NULL ) {
+    throw CryptoException( "Could not allocate AES-OCB context." );
+  }
+  if ( AE_SUCCESS != ae_init( ctx, key.data(), 16, 12, 16 ) ) {
+    ae_free( ctx ); /* the destructor does not run when a constructor throws */
+    throw CryptoException( "Could not initialize AES-OCB context." );
+  }
+}
+
+Session::~Session()
+{
+  /* Report but never throw: throwing here would skip ae_free() and can terminate the program. */
+  if ( ae_clear( ctx ) != AE_SUCCESS ) {
+    fprintf( stderr, "Could not clear AES-OCB context.\n" );
+  }
+  ae_free( ctx );
+}
+
+Nonce::Nonce( uint64_t val )
+{
+  /* Layout: 4 zero octets followed by val as a big-endian 64-bit integer. */
+  const uint64_t big_endian = htobe64( val );
+  memcpy( &bytes[ 4 ], &big_endian, sizeof( big_endian ) );
+  memset( &bytes[ 0 ], 0, 4 );
+}
+
+uint64_t Nonce::val( void )
+{
+  uint64_t big_endian = 0; /* the last 8 octets of bytes hold the value, big-endian */
+  memcpy( &big_endian, &bytes[ 4 ], sizeof( big_endian ) );
+  return be64toh( big_endian );
+}
+
+Nonce::Nonce( char *s_bytes, size_t len )
+{
+  /* Rebuild a nonce from its 8-octet representation; any other length throws. */
+  if ( 8 != len ) {
+    throw CryptoException( "Nonce representation must be 8 octets long." );
+  }
+  memcpy( &bytes[ 4 ], s_bytes, len ); /* len == 8 here */
+  memset( &bytes[ 0 ], 0, 4 );         /* leading 4 octets are zero */
+}
+
+Message::Message( char *nonce_bytes, size_t nonce_len,
+		  char *text_bytes, size_t text_len )
+  : nonce( nonce_bytes, nonce_len ), /* Nonce throws unless nonce_len == 8 */
+    text( text_bytes, text_len )     /* copies text_len octets */
+{}
+
+Message::Message( Nonce s_nonce, string s_text )
+  : nonce( s_nonce ), text( s_text ) /* member-wise copies of both arguments */
+{
+}
+
+string Session::encrypt( Message plaintext )
+{
+  /* Returns the 8-octet nonce followed by the ae_encrypt() output; throws CryptoException on failure. */
+  const size_t pt_len = plaintext.text.size();
+  const int ciphertext_len = pt_len + 16;
+
+  /* Check alignment before allocating, so that this throw cannot leak the buffers. */
+  if ( (uint64_t( plaintext.nonce.data() ) & 0xf) != 0 ) {
+    throw CryptoException( "Bad alignment." );
+  }
+
+  char *ciphertext = (char *)sse_alloc( ciphertext_len );
+  char *pt = (char *)sse_alloc( pt_len );
+  memcpy( pt, plaintext.text.data(), plaintext.text.size() );
+
+  if ( ciphertext_len != ae_encrypt( ctx,                                     /* ctx */
+				     plaintext.nonce.data(),                  /* nonce */
+				     pt,                                      /* pt */
+				     pt_len,                                  /* pt_len */
+				     NULL,                                    /* ad */
+				     0,                                       /* ad_len */
+				     ciphertext,                              /* ct */
+				     NULL,                                    /* tag */
+				     AE_FINALIZE ) ) {                        /* final */
+    free( pt );
+    free( ciphertext );
+    throw CryptoException( "ae_encrypt() returned error." );
+  }
+
+  string text( (char *)ciphertext, ciphertext_len );
+  free( pt );
+  free( ciphertext );
+  return plaintext.nonce.cpp_str() + text;
+}
+
+Message Session::decrypt( string ciphertext )
+{
+  /* Input is an 8-octet nonce followed by the body; returns the nonce and plaintext, or throws. */
+  if ( ciphertext.size() < 24 ) {
+    throw CryptoException( "Ciphertext must contain nonce and tag." );
+  }
+
+  char *str = (char *)ciphertext.data();
+  int body_len = ciphertext.size() - 8;
+  int pt_len = body_len - 16;
+
+  /* pt_len == 0 is a valid empty message; the length must only never look like an error code */
+  if ( (pt_len < 0) || (pt_len == AE_INVALID) ) { /* super-assertion that does not equal AE_INVALID */
+    fprintf( stderr, "BUG.\n" );
+    exit( 1 );
+  }
+
+  Nonce __attribute__((__aligned__ (16))) nonce( str, 8 );
+  char *body = (char *)sse_alloc( body_len );
+  memcpy( body, str + 8, body_len );
+  char *plaintext = (char *)sse_alloc( pt_len ? pt_len : 1 ); /* never request a zero-size buffer */
+
+  if ( pt_len != ae_decrypt( ctx,               /* ctx */
+			     nonce.data(),      /* nonce */
+			     body,              /* ct */
+			     body_len,          /* ct_len */
+			     NULL,              /* ad */
+			     0,                 /* ad_len */
+			     plaintext,         /* pt */
+			     NULL,              /* tag */
+			     AE_FINALIZE ) ) {  /* final */
+    free( plaintext );
+    free( body );
+    throw CryptoException( "ae_decrypt() returned error." );
+  }
+
+  Message ret( nonce, string( plaintext, pt_len ) );
+  free( plaintext );
+  free( body );
+
+  return ret;
+}
diff --git a/crypto.hpp b/crypto.hpp
new file mode 100644
index 0000000..e3be49f
--- /dev/null
+++ b/crypto.hpp
@@ -0,0 +1,65 @@
+#ifndef CRYPTO_HPP
+#define CRYPTO_HPP
+
+#include "ae.hpp"
+#include <string>
+
+using namespace std;
+
+class CryptoException { /* error type thrown by the crypto classes declared in this header */
+public:
+  string text; /* human-readable description of the failure */
+  CryptoException( string s_text ) : text( s_text ) {};
+};
+
+class Base64Key { /* a 16-octet key with a 22-character base64 printable form */
+private:
+  unsigned char key[ 16 ]; /* raw key octets */
+
+public:
+  Base64Key(); /* random key */
+  Base64Key( string printable_key ); /* parse the printable form; throws CryptoException if malformed */
+  string printable_key( void ); /* base64 form with the trailing "==" removed */
+  unsigned char *data( void ) { return key; } /* pointer to the 16 raw octets */
+};
+
+class Nonce { /* 12-octet nonce: 4 zero octets followed by 8 value octets */
+private:
+  char bytes[ 12 ];
+
+public:
+  Nonce( uint64_t val ); /* store val in big-endian order */
+  Nonce( char *s_bytes, size_t len ); /* copy 8 octets verbatim; throws CryptoException if len != 8 */
+
+  string cpp_str( void ) { return string( (char *)( bytes + 4 ), 8 ); } /* the 8 value octets only */
+  char *data( void ) { return bytes; } /* all 12 octets */
+  uint64_t val( void ); /* the value octets decoded from big-endian to host order */
+};
+
+class Message { /* a nonce paired with the message text */
+public:
+  Nonce nonce;
+  string text;
+
+  Message( char *nonce_bytes, size_t nonce_len,
+	   char *text_bytes, size_t text_len ); /* build both members from raw buffers */
+  Message( Nonce s_nonce, string s_text ); /* copy an existing nonce and text */
+};
+
+class Session { /* owns one AES-OCB context (ctx) set up for a Base64Key */
+private:
+  Base64Key key;
+  ae_ctx *ctx;
+  /* Not copyable: declared private and never defined (two copies would both free ctx). */
+  Session( const Session & );
+  Session & operator=( const Session & );
+
+public:
+  Session( Base64Key s_key ); /* throws CryptoException if the context cannot be set up */
+  ~Session();
+
+  string encrypt( Message plaintext ); /* returns nonce octets followed by the encrypted body */
+  Message decrypt( string ciphertext ); /* inverse of encrypt(); throws CryptoException on failure */
+};
+
+#endif
diff --git a/decrypt.cpp b/decrypt.cpp
new file mode 100644
index 0000000..5f2d13b
--- /dev/null
+++ b/decrypt.cpp
@@ -0,0 +1,58 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <assert.h>
+#include <string.h>
+#include <unistd.h>
+#include <iostream>
+
+#include "crypto.hpp"
+
+int main( int argc, char *argv[] )
+{
+  /* Usage: decrypt KEY -- reads ciphertext from stdin, writes the plaintext to stdout. */
+  if ( argc != 2 ) {
+    fprintf( stderr, "Usage: %s KEY\n", argv[ 0 ] );
+    return 1;
+  }
+
+  try {
+    Base64Key key( argv[ 1 ] );
+    Session session( key );
+
+    /* Read input */
+    char *input = NULL;
+    int total_size = 0;
+
+    while ( 1 ) {
+      unsigned char buf[ 16384 ];
+      ssize_t bytes_read = read( STDIN_FILENO, buf, 16384 );
+      if ( bytes_read == 0 ) { /* EOF */
+	break;
+      } else if ( bytes_read < 0 ) {
+	perror( "read" );
+	exit( 1 );
+      } else {
+	input = (char *)realloc( input, total_size + bytes_read );
+	assert( input );
+	memcpy( input + total_size, buf, bytes_read );
+	total_size += bytes_read;
+      }
+    }
+
+    string ciphertext( input, total_size );
+    free( input );
+
+    /* Decrypt message */
+    Message message = session.decrypt( ciphertext );
+    /* val() is a uint64_t; cast so the argument matches the conversion on every ABI */
+    fprintf( stderr, "Nonce = %lld\n",
+	     (long long)message.nonce.val() );
+    cout << message.text;
+  } catch ( const CryptoException &e ) { /* by reference: avoids copying the exception */
+    cerr << e.text << endl;
+    exit( 1 );
+  }
+
+  return 0;
+}
diff --git a/encrypt.cpp b/encrypt.cpp
new file mode 100644
index 0000000..b6d6d95
--- /dev/null
+++ b/encrypt.cpp
@@ -0,0 +1,74 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <assert.h>
+#include <string.h>
+#include <unistd.h>
+#include <iostream>
+
+#include "crypto.hpp"
+
+long int myatoi( char *str )
+{
+  /* Parse str as a base-10 long.  Throws CryptoException unless the whole,
+     non-empty string was consumed and strtol() reported no error. */
+  char *end;
+  errno = 0;
+  long int ret = strtol( str, &end, 10 );
+  if ( ( errno != 0 )
+       || ( end == str )       /* nothing converted, e.g. "" */
+       || ( *end != '\0' ) ) { /* trailing characters after the number */
+    throw CryptoException( "Bad integer." );
+  }
+  return ret;
+}
+
+int main( int argc, char *argv[] )
+{
+  /* Usage: encrypt NONCE -- encrypts stdin under a fresh random key and prints the key on stderr. */
+  if ( argc != 2 ) {
+    fprintf( stderr, "Usage: %s NONCE\n", argv[ 0 ] );
+    return 1;
+  }
+
+  try {
+    Base64Key key;
+    Session session( key );
+    Nonce nonce( myatoi( argv[ 1 ] ) );
+
+    /* Read input */
+    char *input = NULL;
+    int total_size = 0;
+
+    while ( 1 ) {
+      unsigned char buf[ 16384 ];
+      ssize_t bytes_read = read( STDIN_FILENO, buf, 16384 );
+      if ( bytes_read == 0 ) { /* EOF */
+	break;
+      } else if ( bytes_read < 0 ) {
+	perror( "read" );
+	exit( 1 );
+      } else {
+	input = (char *)realloc( input, total_size + bytes_read );
+	assert( input );
+	memcpy( input + total_size, buf, bytes_read );
+	total_size += bytes_read;
+      }
+    }
+
+    string plaintext( input, total_size );
+    free( input );
+
+    /* Encrypt message */
+    string ciphertext = session.encrypt( Message( nonce, plaintext ) );
+
+    cerr << "Key: " << key.printable_key() << endl;
+
+    cout << ciphertext;
+  } catch ( const CryptoException &e ) { /* by reference: avoids copying the exception */
+    cerr << e.text << endl;
+    exit( 1 );
+  }
+
+  return 0;
+}
diff --git a/grant.htm b/grant.htm
new file mode 100644
index 0000000..707d968
--- /dev/null
+++ b/grant.htm
@@ -0,0 +1,38 @@
+<TITLE>OCB - An Authenticated-Encryption Scheme - GPL Patent Grant - Rogaway</TITLE>
+
+<body bgcolor="#FFFFFF">
+<H2><a name="ocb-grant"> <font face="Arial, Helvetica, sans-serif" size="6" color="#FF0000">OCB: 
+  Patent Grant for GNU GPL</font> </a> </H2>
+
+Whereas I, Phillip Rogaway (hereinafter "Inventor") have sought 
+patent protection for certain technology 
+(hereinafter "Patented Technology"), 
+and Inventor wishes to aid the Free Software Foundation in achieving its goals, 
+and Inventor wishes to increase public awareness of Patented Technology, 
+Inventor hereby grants a fully paid-up, nonexclusive, 
+royalty-free license to 
+practice any patents claiming priority to the 
+patent applications below ("the Patents") 
+if practiced by
+software distributed 
+under the terms of any version of 
+the GNU General Public License as published by the Free Software Foundation, 
+59 Temple Place, Suite 330, Boston, MA 02111. 
+Inventor reserves all other rights, including without limitation
+licensing for software not distributed under the GNU General Public License. 
+
+<h4>The patents:</h4>
+
+
+<ul>
+<li> <a href="http://appft1.uspto.gov/netacgi/nph-Parser?Sect1=PTO2&Sect2=HITOFF&p=1&u=%2Fnetahtml%2FPTO%2Fsearch-bool.html&r=2&f=G&l=50&co1=AND&d=PG01&s1=rogaway.IN.&OS=IN/rogaway&RS=IN/rogaway">
+09/918,615</a>  -
+Method and Apparatus for Facilitating Efficient Authenticated Encryption.
+
+<li> <a href="http://appft1.uspto.gov/netacgi/nph-Parser?Sect1=PTO2&Sect2=HITOFF&p=1&u=%2Fnetahtml%2FPTO%2Fsearch-bool.html&r=3&f=G&l=50&co1=AND&d=PG01&s1=rogaway.IN.&OS=IN/rogaway&RS=IN/rogaway">
+09/948,084</a> - 
+Method and Apparatus for Realizing a Parallelizable Variable-Input-Length 
+Pseudorandom Function. 
+</ul>
+
+
diff --git a/ocb.cpp b/ocb.cpp
new file mode 100644
index 0000000..3ac86cf
--- /dev/null
+++ b/ocb.cpp
@@ -0,0 +1,1226 @@
+/*------------------------------------------------------------------------
+/ OCB Version 3 Reference Code (Optimized C)     Last modified 13-JUL-2011
+/-------------------------------------------------------------------------
+/ Copyright (c) 2011 Ted Krovetz.
+/
+/ Permission to use, copy, modify, and/or distribute this software for any
+/ purpose with or without fee is hereby granted, provided that the above
+/ copyright notice and this permission notice appear in all copies.
+/
+/ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+/ WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+/ MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+/ ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+/ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+/ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+/ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+/
+/ Phillip Rogaway holds patents relevant to OCB. See the following for
+/ his patent grant: http://www.cs.ucdavis.edu/~rogaway/ocb/grant.htm
+/
+/ Comments are welcome: Ted Krovetz <ted@krovetz.net> - Dedicated to Laurel K
+/------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------- */
+/* Usage notes                                                             */
+/* ----------------------------------------------------------------------- */
+
+/* - When AE_PENDING is passed as the 'final' parameter of any function,
+/    the length parameters must be a multiple of (BPI*16).
+/  - When available, SSE or AltiVec registers are used to manipulate data.
+/    So, when on machines with these facilities, all pointers passed to
+/    any function should be 16-byte aligned.
+/  - Plaintext and ciphertext pointers may be equal (ie, plaintext gets
+/    encrypted in-place), but no other pair of pointers may be equal.      
+/  - This code assumes all x86 processors have SSE2 and SSSE3 instructions
+/    when compiling under MSVC. If untrue, alter the #define.
+/  - This code is tested for C99 and recent versions of GCC and MSVC.      */
+
+/* ----------------------------------------------------------------------- */
+/* User configuration options                                              */
+/* ----------------------------------------------------------------------- */
+
+/* Set the AES key length to use and length of authentication tag to produce.
+/  Setting either to 0 requires the value be set at runtime via ae_init().
+/  Some optimizations occur for each when set to a fixed value.            */
+#define OCB_KEY_LEN         16  /* 0, 16, 24 or 32. 0 means set in ae_init */
+#define OCB_TAG_LEN         16  /* 0 to 16. 0 means set in ae_init         */
+
+/* This implementation has built-in support for multiple AES APIs. Set any
+/  one of the following to non-zero to specify which to use.               */
+#define USE_OPENSSL_AES      1  /* http://openssl.org                      */
+#define USE_REFERENCE_AES    0  /* Internet search: rijndael-alg-fst.c     */
+#define USE_AES_NI           0  /* Uses compiler's intrinsics              */
+
+/* During encryption and decryption, various "L values" are required.
+/  The L values can be precomputed during initialization (requiring extra
+/  space in ae_ctx), generated as needed (slightly slowing encryption and
+/  decryption), or some combination of the two. L_TABLE_SZ specifies how many
+/  L values to precompute. L_TABLE_SZ must be at least 3. L_TABLE_SZ*16 bytes
+/  are used for L values in ae_ctx. Plaintext and ciphertexts shorter than
+/  2^L_TABLE_SZ blocks need no L values calculated dynamically.            */
+#define L_TABLE_SZ          16
+
+/* Set L_TABLE_SZ_IS_ENOUGH non-zero iff you know that all plaintexts
+/  will be shorter than 2^(L_TABLE_SZ+4) bytes in length. This results
+/  in better performance.                                                  */
+#define L_TABLE_SZ_IS_ENOUGH 1
+
+/* ----------------------------------------------------------------------- */
+/* Includes and compiler specific definitions                              */
+/* ----------------------------------------------------------------------- */
+
+#include "ae.hpp"
+#include <stdlib.h>
+#include <string.h>
+
+/* Define standard sized integers                                          */
+#if defined(_MSC_VER) && (_MSC_VER < 1600)
+	typedef unsigned __int8  uint8_t;
+	typedef unsigned __int32 uint32_t;
+	typedef unsigned __int64 uint64_t;
+	typedef          __int64 int64_t;
+#else
+	#include <stdint.h>
+#endif
+
+/* Compiler-specific intrinsics and fixes: bswap64, ntz                    */
+#if _MSC_VER
+	#define inline __inline        /* MSVC doesn't recognize "inline" in C */
+	#define restrict __restrict  /* MSVC doesn't recognize "restrict" in C */
+    #define __SSE2__   (_M_IX86 || _M_AMD64 || _M_X64)    /* Assume SSE2  */
+    #define __SSSE3__  (_M_IX86 || _M_AMD64 || _M_X64)    /* Assume SSSE3 */
+	#include <intrin.h>
+	#pragma intrinsic(_byteswap_uint64, _BitScanForward, memcpy)
+	#define bswap64(x) _byteswap_uint64(x)
+	static inline unsigned ntz(unsigned x) {unsigned long i = 0; _BitScanForward(&i,x); return (unsigned)i;}  /* trailing-zero count; the index argument must be an unsigned long*, which C++ will not convert from unsigned* */
+#elif __GNUC__
+	#define inline __inline__            /* No "inline" in GCC ansi C mode */
+	#define restrict __restrict__      /* No "restrict" in GCC ansi C mode */
+	#define bswap64(x) __builtin_bswap64(x)           /* Assuming GCC 4.3+ */
+	#define ntz(x)     __builtin_ctz((unsigned)(x))   /* Assuming GCC 3.4+ */
+#else              /* Assume some C99 features: stdint.h, inline, restrict */
+	#define bswap32(x)                                              \
+	   ((((x) & 0xff000000u) >> 24) | (((x) & 0x00ff0000u) >>  8) | \
+		(((x) & 0x0000ff00u) <<  8) | (((x) & 0x000000ffu) << 24))
+
+	 static inline uint64_t bswap64(uint64_t x) {
+		/* Reverse all eight bytes of x: byte-swap each 32-bit half and
+		   exchange the two halves.  Written with shifts, so the result
+		   does not depend on the host byte order. */
+		const uint32_t hi = (uint32_t)(x >> 32), lo = (uint32_t)x;
+		return ((uint64_t)bswap32(lo) << 32) | (uint64_t)bswap32(hi);
+	}
+    
+	#if (L_TABLE_SZ <= 9) && (L_TABLE_SZ_IS_ENOUGH)   /* < 2^13 byte texts */
+	static inline unsigned ntz(unsigned x) {   /* trailing-zero count by table lookup */
+		static const unsigned char tz_table[] = {0,    /* 128 entries; entry i is the trailing-zero count of 4*i */
+		2,3,2,4,2,3,2,5,2,3,2,4,2,3,2,6,2,3,2,4,2,3,2,5,2,3,2,4,2,3,2,7,
+		2,3,2,4,2,3,2,5,2,3,2,4,2,3,2,6,2,3,2,4,2,3,2,5,2,3,2,4,2,3,2,8,
+		2,3,2,4,2,3,2,5,2,3,2,4,2,3,2,6,2,3,2,4,2,3,2,5,2,3,2,4,2,3,2,7,
+		2,3,2,4,2,3,2,5,2,3,2,4,2,3,2,6,2,3,2,4,2,3,2,5,2,3,2,4,2,3,2};
+		return tz_table[x/4];   /* in bounds only for x < 512; exact only when x is a multiple of 4 */
+	}
+	#else       /* From http://supertech.csail.mit.edu/papers/debruijn.pdf */
+	static inline unsigned ntz(unsigned x) {   /* trailing-zero count via the multiply-and-lookup method of the paper cited above */
+		static const unsigned char tz_table[32] =    /* maps the 5-bit hash below to a bit position */
+		{ 0,  1, 28,  2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17,  4, 8, 
+		 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18,  6, 11,  5, 10, 9};
+		return tz_table[((uint32_t)((x & -x) * 0x077CB531u)) >> 27];   /* x & -x keeps only the lowest set bit; the top 5 bits of the product index the table */
+	}
+	#endif
+#endif
+
+/* ----------------------------------------------------------------------- */
+/* Define blocks and operations -- Patch if incorrect on your compiler.    */
+/* ----------------------------------------------------------------------- */
+
+#if __SSE2__
+    #include <xmmintrin.h>              /* SSE instructions and _mm_malloc */
+    #include <emmintrin.h>              /* SSE2 instructions               */
+    typedef __m128i block;
+    #define xor_block(x,y)        _mm_xor_si128(x,y)
+    #define zero_block()          _mm_setzero_si128()
+    #define unequal_blocks(x,y) \
+    					   (_mm_movemask_epi8(_mm_cmpeq_epi8(x,y)) != 0xffff)
+	#if __SSSE3__ || USE_AES_NI
+    #include <tmmintrin.h>              /* SSSE3 instructions              */
+    #define swap_if_le(b) \
+      _mm_shuffle_epi8(b,_mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15))
+	#else
+    static inline block swap_if_le(block b) {   /* reverse the 16 bytes of b using SSE2 instructions only */
+		block a = _mm_shuffle_epi32  (b, _MM_SHUFFLE(0,1,2,3));   /* reverse the order of the four 32-bit lanes */
+		a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(2,3,0,1));   /* swap the 16-bit halves of each upper 32-bit lane */
+		a = _mm_shufflelo_epi16(a, _MM_SHUFFLE(2,3,0,1));   /* likewise for the two lower lanes */
+		return _mm_xor_si128(_mm_srli_epi16(a,8), _mm_slli_epi16(a,8));   /* swap the two bytes of every 16-bit word */
+    }
+	#endif
+	static inline block gen_offset(uint64_t KtopStr[3], unsigned bot) {   /* 128 bits of the 192-bit KtopStr starting bot bits in, each 64-bit half byte-reversed */
+		block hi = _mm_load_si128((__m128i *)(KtopStr+0));   /* hi = B A (aligned load: KtopStr must be 16-byte aligned) */
+		block lo = _mm_loadu_si128((__m128i *)(KtopStr+1));  /* lo = C B */
+		__m128i lshift = _mm_cvtsi32_si128(bot);
+		__m128i rshift = _mm_cvtsi32_si128(64-bot);   /* a count of 64 (bot == 0) makes _mm_srl_epi64 return zero */
+		lo = _mm_xor_si128(_mm_sll_epi64(hi,lshift),_mm_srl_epi64(lo,rshift));   /* per 64-bit lane: (hi << bot) ^ (lo >> (64-bot)) */
+		#if __SSSE3__ || USE_AES_NI
+		return _mm_shuffle_epi8(lo,_mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7));   /* reverse the bytes inside each 64-bit half */
+		#else
+		return swap_if_le(_mm_shuffle_epi32(lo, _MM_SHUFFLE(1,0,3,2)));   /* exchange the 64-bit halves, then reverse all 16 bytes */
+		#endif
+	}
+	static inline block double_block(block bl) {   /* shift the 128-bit value left one bit; a carry out of the top lane is folded back in as 135 */
+		const __m128i mask = _mm_set_epi32(135,1,1,1);   /* top lane carries 135, the other lanes carry 1 */
+		__m128i tmp = _mm_srai_epi32(bl, 31);   /* all ones in every 32-bit lane whose top bit is set */
+		tmp = _mm_and_si128(tmp, mask);
+		tmp = _mm_shuffle_epi32(tmp, _MM_SHUFFLE(2,1,0,3));   /* move each lane's carry up one lane; the top lane's wraps to lane 0 */
+		bl = _mm_slli_epi32(bl, 1);
+		return _mm_xor_si128(bl,tmp);
+	}
+#elif __ALTIVEC__
+    #include <altivec.h>
+    typedef vector unsigned block;
+    #define xor_block(x,y)         vec_xor(x,y)
+    #define zero_block()           vec_splat_u32(0)
+    #define unequal_blocks(x,y)    vec_any_ne(x,y)
+    #define swap_if_le(b)          (b)
+	#if __PPC64__
+	block gen_offset(uint64_t KtopStr[3], unsigned bot) {   /* 128 bits of the 192-bit KtopStr starting bot bits in */
+		union {uint64_t u64[2]; block bl;} rval;   /* bot == 0 is a plain copy: a 64-bit shift by 64 is undefined */
+		rval.u64[0] = bot ? ((KtopStr[0] << bot) | (KtopStr[1] >> (64-bot))) : KtopStr[0];
+		rval.u64[1] = bot ? ((KtopStr[1] << bot) | (KtopStr[2] >> (64-bot))) : KtopStr[1];
+        return rval.bl;
+	}
+	#else
+	/* Special handling: Shifts are mod 32, and no 64-bit types */
+	block gen_offset(uint64_t KtopStr[3], unsigned bot) {   /* same window extraction as the 64-bit variant, built from 32-bit lane shifts */
+		const vector unsigned k32 = {32,32,32,32};
+		vector unsigned hi = *(vector unsigned *)(KtopStr+0);
+		vector unsigned lo = *(vector unsigned *)(KtopStr+2);   /* NOTE(review): loads 16 bytes starting at KtopStr[2], 8 past a 3-element array -- confirm the caller's buffer is large enough */
+		vector unsigned bot_vec;
+		if (bot < 32) {
+			lo = vec_sld(hi,lo,4);   /* lo = the 16 bytes of hi:lo starting 4 bytes in */
+		} else {
+			vector unsigned t = vec_sld(hi,lo,4);
+			lo = vec_sld(hi,lo,8);   /* start one 32-bit word later and shift by 32 less */
+			hi = t;
+			bot = bot - 32;
+		}
+		if (bot == 0) return hi;   /* nothing left to shift; also avoids a lane shift by 32 */
+		*(unsigned *)&bot_vec = bot;   /* place bot in element 0 so it can be splatted */
+		vector unsigned lshift = vec_splat(bot_vec,0);
+		vector unsigned rshift = vec_sub(k32,lshift);   /* 32 - bot in every lane */
+		hi = vec_sl(hi,lshift);
+		lo = vec_sr(lo,rshift);
+		return vec_xor(hi,lo);
+	}
+	#endif
+	static inline block double_block(block b) {   /* shift the 128-bit value left one bit, working byte by byte */
+		const vector unsigned char mask = {135,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};   /* carry values: 135 for element 0, 1 for the rest */
+		const vector unsigned char perm = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0};   /* element i takes element i+1; element 15 takes element 0 */
+		const vector unsigned char shift7  = vec_splat_u8(7);
+		const vector unsigned char shift1  = vec_splat_u8(1);
+		vector unsigned char c = (vector unsigned char)b;
+		vector unsigned char t = vec_sra(c,shift7);   /* 0xff in every byte whose top bit is set */
+		t = vec_and(t,mask);
+		t = vec_perm(t,t,perm);   /* hand each byte's carry to its neighbour */
+		c = vec_sl(c,shift1);
+		return (block)vec_xor(c,t);
+	}
+#elif __ARM_NEON__
+    #include <arm_neon.h>
+    typedef int8x16_t block;      /* Yay! Endian-neutral reads! */
+    #define xor_block(x,y)             veorq_s8(x,y)
+    #define zero_block()               vdupq_n_s8(0)
+    static inline int unequal_blocks(block a, block b) {   /* non-zero iff a and b differ in any bit */
+		int64x2_t t=veorq_s64((int64x2_t)a,(int64x2_t)b);   /* XOR is zero exactly where the inputs agree */
+		return (vgetq_lane_s64(t,0)|vgetq_lane_s64(t,1))!=0;
+    }
+    #define swap_if_le(b)          (b)  /* Using endian-neutral int8x16_t */
+	/* KtopStr is reg correct by 64 bits, return mem correct */
+	block gen_offset(uint64_t KtopStr[3], unsigned bot) {   /* 128 bits of the 192-bit KtopStr starting bot bits in */
+		const union { unsigned x; unsigned char endian; } little = { 1 };   /* endian is 1 when the host is little-endian */
+		const int64x2_t k64 = {-64,-64};
+		uint64x2_t hi = *(uint64x2_t *)(KtopStr+0);   /* hi = A B */
+		uint64x2_t lo = *(uint64x2_t *)(KtopStr+1);   /* lo = B C */
+		int64x2_t ls = vdupq_n_s64(bot);
+		int64x2_t rs = vqaddq_s64(k64,ls);   /* bot - 64: a negative count makes vshlq_u64 shift right */
+		block rval = (block)veorq_u64(vshlq_u64(hi,ls),vshlq_u64(lo,rs));
+		if (little.endian)
+			rval = vrev64q_s8(rval);   /* reverse the bytes inside each 64-bit half */
+		return rval;
+	}
+	static inline block double_block(block b)   /* shift the 128-bit value left one bit, working byte by byte */
+	{
+		const block mask = {135,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};   /* carry values: 135 for element 0, 1 for the rest */
+		block tmp = vshrq_n_s8(b,7);   /* arithmetic shift: -1 in every byte whose top bit is set */
+		tmp = vandq_s8(tmp, mask);
+		tmp = vextq_s8(tmp, tmp, 1);  /* Rotate high byte to end */
+		b = vshlq_n_s8(b,1);
+		return veorq_s8(tmp,b);
+	}
+#else
+    typedef struct { uint64_t l,r; } block;
+    static inline block xor_block(block x, block y) {   /* XOR of two 128-bit blocks, one 64-bit half at a time */
+    	const block z = { x.l ^ y.l, x.r ^ y.r }; return z;
+    }
+    static inline block zero_block(void) { block z; z.l = 0; z.r = 0; return z; }   /* block with both halves zero */
+    #define unequal_blocks(x, y)         ((((x).l^(y).l)|((x).r^(y).r)) != 0)
+    static inline block swap_if_le(block b) {
+		/* Byte-swap each 64-bit half on a little-endian host; return b
+		   unchanged on a big-endian one. */
+		const union { unsigned x; unsigned char endian; } probe = { 1 };
+		if (!probe.endian)
+			return b;   /* first byte of the integer 1 is zero: big-endian */
+		b.l = bswap64(b.l);
+		b.r = bswap64(b.r);
+		return b;
+    }
+	
+	/* KtopStr is reg correct by 64 bits, return mem correct */
+	block gen_offset(uint64_t KtopStr[3], unsigned bot) {
+		/* Take the 128 bits of the 192-bit KtopStr that start bot bits in. */
+		block window;
+		window.l = KtopStr[0];
+		window.r = KtopStr[1];
+		if (bot != 0) {   /* bot == 0 stays a plain copy: a 64-bit shift by 64 is undefined */
+			window.l = (window.l << bot) | (KtopStr[1] >> (64-bot));
+			window.r = (window.r << bot) | (KtopStr[2] >> (64-bot));
+		}
+		return swap_if_le(window);
+	}
+
+	#if __GNUC__ && __arm__
+	static inline block double_block(block b) {   /* 128-bit left shift by one bit as a four-word add-with-carry chain */
+		__asm__ ("adds %1,%1,%1\n\t"   /* double the low word of b.r, setting the carry flag */
+				 "adcs %H1,%H1,%H1\n\t"   /* high word of b.r, plus carry */
+				 "adcs %0,%0,%0\n\t"   /* low word of b.l, plus carry */
+				 "adcs %H0,%H0,%H0\n\t"   /* high word of b.l, plus carry */
+				 "eorcs %1,%1,#135"   /* if a bit fell off the top, XOR 135 into the low word of b.r */
+		: "+r"(b.l), "+r"(b.r) : : "cc");   /* %0 = b.l, %1 = b.r, both read and written; flags clobbered */
+		return b;
+	}
+	#else
+	static inline block double_block(block b) {   /* shift l:r left one bit; a carry out of l is folded back in as 135 */
+		const uint64_t fold = ((int64_t)b.l < 0) ? 135 : 0;   /* top bit of l is about to be shifted out */
+		b.l = (b.l << 1) ^ (b.r >> 63);   /* top bit of r moves up into l */
+		b.r = (b.r << 1) ^ fold;
+		return b;
+	}
+	#endif
+    
+#endif
+
+/* ----------------------------------------------------------------------- */
+/* AES - Code uses OpenSSL API. Other implementations get mapped to it.    */
+/* ----------------------------------------------------------------------- */
+
+/*---------------*/
+#if USE_OPENSSL_AES
+/*---------------*/
+
+#include <openssl/aes.h>                            /* http://openssl.org/ */
+
+/* How to ECB encrypt an array of blocks, in place                         */
+static inline void AES_ecb_encrypt_blks(block *blks, unsigned nblks, AES_KEY *key) {   /* encrypt nblks blocks in place, last block first */
+	for (unsigned i = nblks; i-- > 0; ) {
+		unsigned char *bytes = (unsigned char *)(blks+i);
+		AES_encrypt(bytes, bytes, key);
+	}
+}
+
+static inline void AES_ecb_decrypt_blks(block *blks, unsigned nblks, AES_KEY *key) {   /* decrypt nblks blocks in place, last block first */
+	for (unsigned i = nblks; i-- > 0; ) {
+		unsigned char *bytes = (unsigned char *)(blks+i);
+		AES_decrypt(bytes, bytes, key);
+	}
+}
+
+#define BPI 4  /* Number of blocks in buffer per ECB call */
+
+/*-------------------*/
+#elif USE_REFERENCE_AES
+/*-------------------*/
+
+#include "rijndael-alg-fst.h"              /* Barreto's Public-Domain Code */
+#if (OCB_KEY_LEN == 0)
+	typedef struct { uint32_t rd_key[60]; int rounds; } AES_KEY;
+	#define ROUNDS(ctx) ((ctx)->rounds)
+	#define AES_set_encrypt_key(x, y, z) \
+	 do {rijndaelKeySetupEnc((z)->rd_key, x, y); (z)->rounds = y/32+6;} while (0)
+	#define AES_set_decrypt_key(x, y, z) \
+	 do {rijndaelKeySetupDec((z)->rd_key, x, y); (z)->rounds = y/32+6;} while (0)
+#else
+	typedef struct { uint32_t rd_key[OCB_KEY_LEN+28]; } AES_KEY;
+	#define ROUNDS(ctx) (6+OCB_KEY_LEN/4)
+	#define AES_set_encrypt_key(x, y, z) rijndaelKeySetupEnc((z)->rd_key, x, y)
+	#define AES_set_decrypt_key(x, y, z) rijndaelKeySetupDec((z)->rd_key, x, y)
+#endif
+#define AES_encrypt(x,y,z) rijndaelEncrypt((z)->rd_key, ROUNDS(z), x, y)
+#define AES_decrypt(x,y,z) rijndaelDecrypt((z)->rd_key, ROUNDS(z), x, y)
+
+static void AES_ecb_encrypt_blks(block *blks, unsigned nblks, AES_KEY *key) {   /* encrypt nblks blocks in place, last block first */
+	for (unsigned i = nblks; i-- > 0; ) {
+		unsigned char *bytes = (unsigned char *)(blks+i);
+		AES_encrypt(bytes, bytes, key);
+	}
+}
+
+ void AES_ecb_decrypt_blks(block *blks, unsigned nblks, AES_KEY *key) {   /* decrypt nblks blocks in place, last block first */
+	for (unsigned i = nblks; i-- > 0; ) {
+		unsigned char *bytes = (unsigned char *)(blks+i);
+		AES_decrypt(bytes, bytes, key);
+	}
+}
+
+#define BPI 4  /* Number of blocks in buffer per ECB call */
+
+/*----------*/
+#elif USE_AES_NI
+/*----------*/
+
+#include <wmmintrin.h>
+
+#if (OCB_KEY_LEN == 0)
+	typedef struct { __m128i rd_key[15]; int rounds; } AES_KEY;
+	#define ROUNDS(ctx) ((ctx)->rounds)
+#else
+	typedef struct { __m128i rd_key[7+OCB_KEY_LEN/4]; } AES_KEY;
+	#define ROUNDS(ctx) (6+OCB_KEY_LEN/4)
+#endif
+
+#define EXPAND_ASSIST(v1,v2,v3,v4,shuff_const,aes_const)                    \
+    v2 = _mm_aeskeygenassist_si128(v4,aes_const);                           \
+    v3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(v3),              \
+                                         _mm_castsi128_ps(v1), 16));        \
+    v1 = _mm_xor_si128(v1,v3);                                              \
+    v3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(v3),              \
+                                         _mm_castsi128_ps(v1), 140));       \
+    v1 = _mm_xor_si128(v1,v3);                                              \
+    v2 = _mm_shuffle_epi32(v2,shuff_const);                                 \
+    v1 = _mm_xor_si128(v1,v2)
+
+#define EXPAND192_STEP(idx,aes_const)                                       \
+    EXPAND_ASSIST(x0,x1,x2,x3,85,aes_const);                                \
+    x3 = _mm_xor_si128(x3,_mm_slli_si128 (x3, 4));                          \
+    x3 = _mm_xor_si128(x3,_mm_shuffle_epi32(x0, 255));                      \
+    kp[idx] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp),        \
+                                              _mm_castsi128_ps(x0), 68));   \
+    kp[idx+1] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0),       \
+                                                _mm_castsi128_ps(x3), 78)); \
+    EXPAND_ASSIST(x0,x1,x2,x3,85,(aes_const*2));                            \
+    x3 = _mm_xor_si128(x3,_mm_slli_si128 (x3, 4));                          \
+    x3 = _mm_xor_si128(x3,_mm_shuffle_epi32(x0, 255));                      \
+    kp[idx+2] = x0; tmp = x3
+
+void AES_128_Key_Expansion(const unsigned char *userkey, void *key)   /* fills kp[0..10] from the 16 bytes at userkey */
+{
+    __m128i x0,x1,x2;
+    __m128i *kp = (__m128i *)key;   /* key is written as an array of 128-bit entries */
+    kp[0] = x0 = _mm_loadu_si128((__m128i*)userkey);   /* entry 0 is the user key itself (unaligned load) */
+    x2 = _mm_setzero_si128();
+    EXPAND_ASSIST(x0,x1,x2,x0,255,1);   kp[1]  = x0;   /* each step derives the next entry from the previous one */
+    EXPAND_ASSIST(x0,x1,x2,x0,255,2);   kp[2]  = x0;
+    EXPAND_ASSIST(x0,x1,x2,x0,255,4);   kp[3]  = x0;
+    EXPAND_ASSIST(x0,x1,x2,x0,255,8);   kp[4]  = x0;
+    EXPAND_ASSIST(x0,x1,x2,x0,255,16);  kp[5]  = x0;
+    EXPAND_ASSIST(x0,x1,x2,x0,255,32);  kp[6]  = x0;
+    EXPAND_ASSIST(x0,x1,x2,x0,255,64);  kp[7]  = x0;
+    EXPAND_ASSIST(x0,x1,x2,x0,255,128); kp[8]  = x0;
+    EXPAND_ASSIST(x0,x1,x2,x0,255,27);  kp[9]  = x0;
+    EXPAND_ASSIST(x0,x1,x2,x0,255,54);  kp[10] = x0;
+}
+
+void AES_192_Key_Expansion(const unsigned char *userkey, void *key)   /* fills kp[0..12] */
+{
+    __m128i x0,x1,x2,x3,tmp,*kp = (__m128i *)key;
+    kp[0] = x0 = _mm_loadu_si128((__m128i*)userkey);
+    tmp = x3 = _mm_loadu_si128((__m128i*)(userkey+16));   /* NOTE(review): reads userkey[16..31]; a 192-bit key is 24 bytes -- confirm the 8-byte over-read is acceptable */
+    x2 = _mm_setzero_si128();
+    EXPAND192_STEP(1,1);   /* each step writes kp[idx], kp[idx+1] and kp[idx+2] */
+    EXPAND192_STEP(4,4);
+    EXPAND192_STEP(7,16);
+    EXPAND192_STEP(10,64);
+}
+
+void AES_256_Key_Expansion(const unsigned char *userkey, void *key)   /* fills kp[0..14] from the 32 bytes at userkey */
+{
+    __m128i x0,x1,x2,x3,*kp = (__m128i *)key;
+    kp[0] = x0 = _mm_loadu_si128((__m128i*)userkey   );   /* entries 0 and 1 are the two halves of the user key */
+    kp[1] = x3 = _mm_loadu_si128((__m128i*)(userkey+16));
+    x2 = _mm_setzero_si128();
+    EXPAND_ASSIST(x0,x1,x2,x3,255,1);  kp[2]  = x0;   /* even entries update x0, odd entries update x3 */
+    EXPAND_ASSIST(x3,x1,x2,x0,170,1);  kp[3]  = x3;
+    EXPAND_ASSIST(x0,x1,x2,x3,255,2);  kp[4]  = x0;
+    EXPAND_ASSIST(x3,x1,x2,x0,170,2);  kp[5]  = x3;
+    EXPAND_ASSIST(x0,x1,x2,x3,255,4);  kp[6]  = x0;
+    EXPAND_ASSIST(x3,x1,x2,x0,170,4);  kp[7]  = x3;
+    EXPAND_ASSIST(x0,x1,x2,x3,255,8);  kp[8]  = x0;
+    EXPAND_ASSIST(x3,x1,x2,x0,170,8);  kp[9]  = x3;
+    EXPAND_ASSIST(x0,x1,x2,x3,255,16); kp[10] = x0;
+    EXPAND_ASSIST(x3,x1,x2,x0,170,16); kp[11] = x3;
+    EXPAND_ASSIST(x0,x1,x2,x3,255,32); kp[12] = x0;
+    EXPAND_ASSIST(x3,x1,x2,x0,170,32); kp[13] = x3;
+    EXPAND_ASSIST(x0,x1,x2,x3,255,64); kp[14] = x0;
+}
+
+int AES_set_encrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key)
+{   /* Expand userKey (bits = 128, 192 or 256) into key; returns 0, or -2 for any other size. */
+    if (bits == 128) {
+        AES_128_Key_Expansion (userKey,key);
+    } else if (bits == 192) {
+        AES_192_Key_Expansion (userKey,key);
+    } else if (bits == 256) {
+        AES_256_Key_Expansion (userKey,key);
+    } else { return -2; }   /* unsupported size: never report success with key left unset */
+    #if (OCB_KEY_LEN == 0)
+    	key->rounds = 6+bits/32;
+    #endif
+    return 0;
+}
+
+ void AES_set_decrypt_key_fast(AES_KEY *dkey, const AES_KEY *ekey)
+{
+    /* Derive the decryption schedule from an encryption schedule: the round
+       keys are stored in reverse order, and every key except the first and
+       last is passed through _mm_aesimc_si128. */
+    const int last = ROUNDS(ekey);
+    int r;
+    #if (OCB_KEY_LEN == 0)
+    	dkey->rounds = last;
+    #endif
+    dkey->rd_key[last] = ekey->rd_key[0];
+    for (r = 1; r < last; r++)
+        dkey->rd_key[last - r] = _mm_aesimc_si128(ekey->rd_key[r]);
+    dkey->rd_key[0] = ekey->rd_key[r];
+}
+
+/* Build a decryption key schedule straight from raw key bytes: expand the
+ * encryption schedule into a temporary, then convert it.  Always returns 0. */
+int AES_set_decrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key)
+{
+    AES_KEY enc_schedule;
+
+    AES_set_encrypt_key(userKey, bits, &enc_schedule);
+    AES_set_decrypt_key_fast(key, &enc_schedule);
+
+    return 0;
+}
+
+/* Encrypt the single 16-byte block at `in` into `out` using the schedule in
+ * `key`.  Both buffers are accessed with aligned 128-bit load/store. */
+static inline void AES_encrypt(const unsigned char *in,
+                        unsigned char *out, const AES_KEY *key)
+{
+	const int last_round = ROUNDS(key);
+	const __m128i *round_keys = ((__m128i *)(key->rd_key));
+	int round = 1;
+	/* Initial whitening with round key 0 */
+	__m128i state = _mm_xor_si128 (_mm_load_si128 ((__m128i*)in), round_keys[0]);
+	while (round < last_round) {
+		state = _mm_aesenc_si128 (state, round_keys[round]);
+		++round;
+	}
+	state = _mm_aesenclast_si128 (state, round_keys[round]);
+	_mm_store_si128 ((__m128i*)out, state);
+}
+
+/* Decrypt the single 16-byte block at `in` into `out` using the decryption
+ * schedule in `key`.  Both buffers are accessed with aligned 128-bit
+ * load/store. */
+static inline void AES_decrypt(const unsigned char *in,
+                        unsigned char *out, const AES_KEY *key)
+{
+	const int last_round = ROUNDS(key);
+	const __m128i *round_keys = ((__m128i *)(key->rd_key));
+	int round = 1;
+	/* Initial whitening with round key 0 */
+	__m128i state = _mm_xor_si128 (_mm_load_si128 ((__m128i*)in), round_keys[0]);
+	while (round < last_round) {
+		state = _mm_aesdec_si128 (state, round_keys[round]);
+		++round;
+	}
+	state = _mm_aesdeclast_si128 (state, round_keys[round]);
+	_mm_store_si128 ((__m128i*)out, state);
+}
+
+/* Encrypt `nblks` blocks in place.  Each round is applied to every block
+ * before the next round starts. */
+static inline void AES_ecb_encrypt_blks(block *blks, unsigned nblks, AES_KEY *key) {
+	const unsigned last_round = ROUNDS(key);
+	const __m128i *round_keys = ((__m128i *)(key->rd_key));
+	unsigned b;
+	unsigned round = 1;
+
+	/* Whitening with round key 0 */
+	for (b = 0; b < nblks; ++b)
+		blks[b] = _mm_xor_si128(blks[b], round_keys[0]);
+
+	/* Middle rounds */
+	while (round < last_round) {
+		for (b = 0; b < nblks; ++b)
+			blks[b] = _mm_aesenc_si128(blks[b], round_keys[round]);
+		++round;
+	}
+
+	/* Final round */
+	for (b = 0; b < nblks; ++b)
+		blks[b] = _mm_aesenclast_si128(blks[b], round_keys[round]);
+}
+
+/* Decrypt `nblks` blocks in place with a decryption schedule.  Each round is
+ * applied to every block before the next round starts. */
+static inline void AES_ecb_decrypt_blks(block *blks, unsigned nblks, AES_KEY *key) {
+	const unsigned last_round = ROUNDS(key);
+	const __m128i *round_keys = ((__m128i *)(key->rd_key));
+	unsigned b;
+	unsigned round = 1;
+
+	/* Whitening with round key 0 */
+	for (b = 0; b < nblks; ++b)
+		blks[b] = _mm_xor_si128(blks[b], round_keys[0]);
+
+	/* Middle rounds */
+	while (round < last_round) {
+		for (b = 0; b < nblks; ++b)
+			blks[b] = _mm_aesdec_si128(blks[b], round_keys[round]);
+		++round;
+	}
+
+	/* Final round */
+	for (b = 0; b < nblks; ++b)
+		blks[b] = _mm_aesdeclast_si128(blks[b], round_keys[round]);
+}
+
+#define BPI 8  /* Number of blocks in buffer per ECB call   */
+               /* Set to 4 for Westmere, 8 for Sandy Bridge */
+
+#endif
+
+/* ----------------------------------------------------------------------- */
+/* Define OCB context structure.                                           */
+/* ----------------------------------------------------------------------- */
+
+/*------------------------------------------------------------------------
+/ Each item in the OCB context is stored either "memory correct" or
+/ "register correct". On big-endian machines, this is identical. On
+/ little-endian machines, one must choose whether the byte-string
+/ is in the correct order when it resides in memory or in registers.
+/ It must be register correct whenever it is to be manipulated
+/ arithmetically, but must be memory correct whenever it interacts
+/ with the plaintext or ciphertext.
+/------------------------------------------------------------------------- */
+ 
+/* Per-key and per-message OCB state.
+ *   offset, checksum, blocks_processed           running state of the message
+ *                                                body (ae_encrypt/ae_decrypt)
+ *   ad_offset, ad_checksum, ad_blocks_processed  running state of the
+ *                                                associated data (process_ad)
+ *   Lstar, Ldollar, L[]                          key-dependent values computed
+ *                                                once in ae_init
+ *   cached_Top, KtopStr                          nonce cache maintained by
+ *                                                gen_offset_from_nonce
+ *   tag_len                                      runtime tag length; present
+ *                                                only when OCB_TAG_LEN == 0
+ */
+struct _ae_ctx {
+    block offset;                          /* Memory correct               */
+    block checksum;                        /* Memory correct               */
+    block Lstar;                           /* Memory correct               */
+    block Ldollar;                         /* Memory correct               */
+    block L[L_TABLE_SZ];                   /* Memory correct               */
+    block ad_checksum;                     /* Memory correct               */
+    block ad_offset;                       /* Memory correct               */
+    block cached_Top;                      /* Memory correct               */
+	uint64_t KtopStr[3];                   /* Register correct, each item  */
+    uint32_t ad_blocks_processed;
+    uint32_t blocks_processed;
+    AES_KEY decrypt_key;
+    AES_KEY encrypt_key;
+    #if (OCB_TAG_LEN == 0)
+    unsigned tag_len;
+    #endif
+};
+
+/* ----------------------------------------------------------------------- */
+/* L table lookup (or on-the-fly generation)                               */
+/* ----------------------------------------------------------------------- */
+
+#if L_TABLE_SZ_IS_ENOUGH
+#define getL(_ctx, _tz) ((_ctx)->L[_tz])
+#else
+/* Return L[tz].  Entries below L_TABLE_SZ come straight from the table;
+   larger indices are generated by repeatedly doubling the last entry. */
+static block getL(const ae_ctx *ctx, unsigned tz)
+{
+    block reg;
+    unsigned doublings;
+
+    if (tz < L_TABLE_SZ)
+        return ctx->L[tz];
+
+    /* Last table entry, converted to register-correct order for arithmetic */
+    reg = swap_if_le(ctx->L[L_TABLE_SZ-1]);
+    /* One doubling reaches L[L_TABLE_SZ]; one more per index beyond that */
+    doublings = tz - L_TABLE_SZ + 1;
+    do {
+        reg = double_block(reg);
+    } while (--doublings);
+
+    return swap_if_le(reg);                  /* Back to memory correct */
+}
+#endif
+
+/* ----------------------------------------------------------------------- */
+/* Public functions                                                        */
+/* ----------------------------------------------------------------------- */
+
+/* 32-bit SSE2 and Altivec systems need to be forced to allocate memory
+   on 16-byte alignments. (I believe all major 64-bit systems do already.) */
+
+ae_ctx* ae_allocate(void *misc)
+{
+	/* Allocate an uninitialized context.  On 32-bit SSE2 and 32-bit Altivec
+	   builds a 16-byte aligned allocator is used; otherwise plain malloc.
+	   Yields NULL if the allocator reports failure. */
+	const size_t ctx_bytes = sizeof(ae_ctx);
+	void *mem;
+	(void) misc;                     /* accepted but ignored */
+	#if (__SSE2__ && !_M_X64 && !_M_AMD64 && !__amd64__)
+		mem = _mm_malloc(ctx_bytes,16);
+	#elif (__ALTIVEC__ && !__PPC64__)
+		if (posix_memalign(&mem,16,ctx_bytes) != 0) mem = NULL;
+	#else
+		mem = malloc(ctx_bytes);
+	#endif
+	return (ae_ctx *)mem;
+}
+
+/* Release a context obtained from ae_allocate, using the deallocator that
+   matches the allocator chosen there. */
+void ae_free(ae_ctx *ctx)
+{
+	#if !(__SSE2__ && !_M_X64 && !_M_AMD64 && !__amd64__)
+		free(ctx);
+	#else
+		_mm_free(ctx);
+	#endif
+}
+
+/* ----------------------------------------------------------------------- */
+
+/* Overwrite the whole context with zero bytes; ae_init must be called again
+   before the context is reused.  Always returns AE_SUCCESS. */
+int ae_clear (ae_ctx *ctx)
+{
+	memset(ctx, 0, sizeof *ctx);
+	return AE_SUCCESS;
+}
+
+/* Size in bytes of an ae_ctx, for callers that allocate one themselves. */
+int ae_ctx_sizeof(void)
+{
+    const size_t ctx_bytes = sizeof(ae_ctx);
+    return (int) ctx_bytes;
+}
+
+/* ----------------------------------------------------------------------- */
+
+/* Initialize `ctx` for use with a given AES key.
+ *
+ *   key        raw AES key bytes
+ *   key_len    key length in bytes (16, 24 or 32); overridden by OCB_KEY_LEN
+ *              when that is fixed at compile time
+ *   nonce_len  must be 12
+ *   tag_len    tag length in bytes (1..16); used only when OCB_TAG_LEN == 0
+ *
+ * Returns AE_SUCCESS, or AE_NOT_SUPPORTED if the nonce, key or tag length is
+ * not supported.  Lengths are rejected before any key schedule is built, so
+ * an unsupported key length can no longer leave the context holding
+ * uninitialized round keys, and a runtime tag length can no longer exceed
+ * the 16-byte tag block or be empty. */
+int ae_init(ae_ctx *ctx, const void *key, int key_len, int nonce_len, int tag_len)
+{
+    unsigned i;
+    block tmp_blk;
+    
+    if (nonce_len != 12)
+    	return AE_NOT_SUPPORTED;
+    
+    #if (OCB_KEY_LEN > 0)
+    key_len = OCB_KEY_LEN;
+    #endif
+    /* AES is only defined for 128-, 192- and 256-bit keys */
+    if (key_len != 16 && key_len != 24 && key_len != 32)
+    	return AE_NOT_SUPPORTED;
+
+    #if (OCB_TAG_LEN == 0)
+    /* The tag is copied out of / compared against a single 16-byte block */
+    if (tag_len < 1 || tag_len > 16)
+    	return AE_NOT_SUPPORTED;
+    #endif
+
+    /* Initialize encryption & decryption keys */
+    AES_set_encrypt_key((unsigned char *)key, key_len*8, &ctx->encrypt_key);
+    #if USE_AES_NI
+    AES_set_decrypt_key_fast(&ctx->decrypt_key,&ctx->encrypt_key);
+    #else
+    AES_set_decrypt_key((unsigned char *)key, (int)(key_len*8), &ctx->decrypt_key);
+    #endif
+    
+    /* Zero things that need zeroing */
+    ctx->cached_Top = ctx->ad_checksum = zero_block();
+    ctx->ad_blocks_processed = 0;
+    
+    /* Compute key-dependent values: Lstar is the encryption of the all-zero
+       block (cached_Top was just zeroed); Ldollar and L[i] are successive
+       doublings of it, computed register-correct and stored memory-correct */
+    AES_encrypt((unsigned char *)&ctx->cached_Top,
+                            (unsigned char *)&ctx->Lstar, &ctx->encrypt_key);
+    tmp_blk = swap_if_le(ctx->Lstar);
+    tmp_blk = double_block(tmp_blk);
+    ctx->Ldollar = swap_if_le(tmp_blk);
+    tmp_blk = double_block(tmp_blk);
+    ctx->L[0] = swap_if_le(tmp_blk);
+    for (i = 1; i < L_TABLE_SZ; i++) {
+		tmp_blk = double_block(tmp_blk);
+    	ctx->L[i] = swap_if_le(tmp_blk);
+    }
+
+    #if (OCB_TAG_LEN == 0)
+    	ctx->tag_len = tag_len;
+    #else
+    	(void) tag_len;  /* Suppress var not used error */
+    #endif
+
+    return AE_SUCCESS;
+}
+
+/* ----------------------------------------------------------------------- */
+
+/* Compute the initial offset for a message from its 12-byte nonce.
+ *
+ * The nonce is placed in bytes 4..15 of a block whose first four bytes are
+ * 00 00 00 01.  The low 6 bits of the last byte are split off as `idx`; the
+ * rest of the block ("Top") is encrypted to give KtopStr.  That encryption
+ * is cached in ctx and redone only when Top differs from ctx->cached_Top.
+ * The result is gen_offset(ctx->KtopStr, idx).
+ *
+ * The nonce is copied with memcpy rather than read through a uint32_t
+ * pointer, so `nonce` needs no particular alignment; the bytes stored are
+ * the same either way. */
+static block gen_offset_from_nonce(ae_ctx *ctx, const void *nonce)
+{
+	const union { unsigned x; unsigned char endian; } little = { 1 };
+	union { uint32_t u32[4]; uint8_t u8[16]; block bl; } tmp;
+	unsigned idx;
+
+	/* Replace cached nonce Top if needed */
+	tmp.u32[0] = (little.endian?0x01000000:0x00000001);
+	memcpy(&tmp.u8[4], nonce, 12);         /* Nonce fills bytes 4..15  */
+	idx = (unsigned)(tmp.u8[15] & 0x3f);   /* Get low 6 bits of nonce  */
+	tmp.u8[15] = tmp.u8[15] & 0xc0;        /* Zero low 6 bits of nonce */
+	if ( unequal_blocks(tmp.bl,ctx->cached_Top) )   { /* Cached?       */
+		ctx->cached_Top = tmp.bl;          /* Update cache, KtopStr    */
+		AES_encrypt(tmp.u8, (unsigned char *)&ctx->KtopStr, &ctx->encrypt_key);
+		if (little.endian) {               /* Make Register Correct    */
+			ctx->KtopStr[0] = bswap64(ctx->KtopStr[0]);
+			ctx->KtopStr[1] = bswap64(ctx->KtopStr[1]);
+		}
+		/* Third word extends the 128-bit string for idx-bit extraction */
+		ctx->KtopStr[2] = ctx->KtopStr[0] ^
+						 (ctx->KtopStr[0] << 8) ^ (ctx->KtopStr[1] >> 56);
+	}
+	return gen_offset(ctx->KtopStr, idx);
+}
+
+/* Absorb associated data into the context's AD checksum.
+ *
+ *   ad      associated data; read as an array of `block`
+ *   ad_len  number of bytes at `ad`
+ *   final   nonzero if this is the last associated data of the message
+ *
+ * Whole groups of BPI blocks are always processed, and ctx->ad_offset,
+ * ctx->ad_checksum and ctx->ad_blocks_processed are updated to match.  The
+ * trailing ad_len % (BPI*16) bytes are processed only when `final` is
+ * nonzero: any whole blocks first, then a last partial block padded with
+ * 0x80 followed by zeros and offset with Lstar.  In that case only
+ * ctx->ad_checksum is written back. */
+ void process_ad(ae_ctx *ctx, const void *ad, int ad_len, int final)
+{
+	union { uint32_t u32[4]; uint8_t u8[16]; block bl; } tmp;
+    block ad_offset, ad_checksum;
+    const block *  adp = (block *)ad;
+	unsigned i,k,tz,remaining;
+    
+    ad_offset = ctx->ad_offset;
+    ad_checksum = ctx->ad_checksum;
+    i = ad_len/(BPI*16);                /* Number of full BPI-block groups */
+    if (i) {
+		unsigned ad_block_num = ctx->ad_blocks_processed;
+		do {
+			block ta[BPI], oa[BPI];
+			ad_block_num += BPI;
+			tz = ntz(ad_block_num);
+			/* Each oa[] is the group's starting offset xored with a fixed
+			   combination of L[0..2]; only the group's last block needs
+			   getL(tz), and its offset becomes the new ad_offset */
+			oa[0] = xor_block(ad_offset, ctx->L[0]);
+			ta[0] = xor_block(oa[0], adp[0]);
+			oa[1] = xor_block(oa[0], ctx->L[1]);
+			ta[1] = xor_block(oa[1], adp[1]);
+			oa[2] = xor_block(ad_offset, ctx->L[1]);
+			ta[2] = xor_block(oa[2], adp[2]);
+			#if BPI == 4
+				ad_offset = xor_block(oa[2], getL(ctx, tz));
+				ta[3] = xor_block(ad_offset, adp[3]);
+			#elif BPI == 8
+				oa[3] = xor_block(oa[2], ctx->L[2]);
+				ta[3] = xor_block(oa[3], adp[3]);
+				oa[4] = xor_block(oa[1], ctx->L[2]);
+				ta[4] = xor_block(oa[4], adp[4]);
+				oa[5] = xor_block(oa[0], ctx->L[2]);
+				ta[5] = xor_block(oa[5], adp[5]);
+				oa[6] = xor_block(ad_offset, ctx->L[2]);
+				ta[6] = xor_block(oa[6], adp[6]);
+				ad_offset = xor_block(oa[6], getL(ctx, tz));
+				ta[7] = xor_block(ad_offset, adp[7]);
+			#endif
+			AES_ecb_encrypt_blks(ta,BPI,&ctx->encrypt_key);
+			ad_checksum = xor_block(ad_checksum, ta[0]);
+			ad_checksum = xor_block(ad_checksum, ta[1]);
+			ad_checksum = xor_block(ad_checksum, ta[2]);
+			ad_checksum = xor_block(ad_checksum, ta[3]);
+			#if (BPI == 8)
+			ad_checksum = xor_block(ad_checksum, ta[4]);
+			ad_checksum = xor_block(ad_checksum, ta[5]);
+			ad_checksum = xor_block(ad_checksum, ta[6]);
+			ad_checksum = xor_block(ad_checksum, ta[7]);
+			#endif
+			adp += BPI;
+		} while (--i);
+		ctx->ad_blocks_processed = ad_block_num;
+		ctx->ad_offset = ad_offset;
+		ctx->ad_checksum = ad_checksum;
+	}
+
+    if (final) {
+		block ta[BPI];
+		
+        /* Process remaining associated data, compute its tag contribution */
+        remaining = ((unsigned)ad_len) % (BPI*16);
+        if (remaining) {
+			k=0;                        /* Number of blocks queued in ta[] */
+			#if (BPI == 8)
+			if (remaining >= 64) {
+				tmp.bl = xor_block(ad_offset, ctx->L[0]);
+				ta[0] = xor_block(tmp.bl, adp[0]);
+				tmp.bl = xor_block(tmp.bl, ctx->L[1]);
+				ta[1] = xor_block(tmp.bl, adp[1]);
+				ad_offset = xor_block(ad_offset, ctx->L[1]);
+				ta[2] = xor_block(ad_offset, adp[2]);
+				ad_offset = xor_block(ad_offset, ctx->L[2]);
+				ta[3] = xor_block(ad_offset, adp[3]);
+				remaining -= 64;
+				k=4;
+			}
+			#endif
+			if (remaining >= 32) {
+				ad_offset = xor_block(ad_offset, ctx->L[0]);
+				ta[k] = xor_block(ad_offset, adp[k]);
+				ad_offset = xor_block(ad_offset, getL(ctx, ntz(k+2)));
+				ta[k+1] = xor_block(ad_offset, adp[k+1]);
+				remaining -= 32;
+				k+=2;
+			}
+			if (remaining >= 16) {
+				ad_offset = xor_block(ad_offset, ctx->L[0]);
+				ta[k] = xor_block(ad_offset, adp[k]);
+				remaining = remaining - 16;
+				++k;
+			}
+			if (remaining) {
+				/* Partial last block: pad with 0x80 then zeros, use Lstar */
+				ad_offset = xor_block(ad_offset,ctx->Lstar);
+				tmp.bl = zero_block();
+				memcpy(tmp.u8, adp+k, remaining);
+				tmp.u8[remaining] = (unsigned char)0x80u;
+				ta[k] = xor_block(ad_offset, tmp.bl);
+				++k;
+			}
+			AES_ecb_encrypt_blks(ta,k,&ctx->encrypt_key);
+			/* Fold ta[k-1] down to ta[0] into the checksum; every case
+			   deliberately falls through to the ones below it */
+			switch (k) {
+				#if (BPI == 8)
+				case 8: ad_checksum = xor_block(ad_checksum, ta[7]);
+				case 7: ad_checksum = xor_block(ad_checksum, ta[6]);
+				case 6: ad_checksum = xor_block(ad_checksum, ta[5]);
+				case 5: ad_checksum = xor_block(ad_checksum, ta[4]);
+				#endif
+				case 4: ad_checksum = xor_block(ad_checksum, ta[3]);
+				case 3: ad_checksum = xor_block(ad_checksum, ta[2]);
+				case 2: ad_checksum = xor_block(ad_checksum, ta[1]);
+				case 1: ad_checksum = xor_block(ad_checksum, ta[0]);
+			}
+			ctx->ad_checksum = ad_checksum;
+		}
+	}
+}
+
+/* ----------------------------------------------------------------------- */
+
+/* Encrypt (part of) a message and, on the final call, produce its tag.
+ *
+ *   ctx     context set up by ae_init
+ *   nonce   12-byte nonce on the first call for a message (resets the
+ *           per-message state); NULL on continuation calls
+ *   pt      plaintext, read as an array of `block`; pt_len is in bytes
+ *   ad      associated data, passed to process_ad when ad_len > 0
+ *   ct      receives the ciphertext, written as an array of `block`
+ *   tag     receives the tag on the final call; if NULL the tag is appended
+ *           to the ciphertext at ct + pt_len
+ *   final   nonzero if this call completes the message
+ *
+ * Whole groups of BPI blocks are always encrypted.  The trailing
+ * pt_len % (BPI*16) bytes are handled only when `final` is nonzero.
+ *
+ * Returns pt_len, increased by the tag length when the tag was appended to
+ * the ciphertext. */
+int ae_encrypt(ae_ctx     *  ctx,
+               const void *  nonce,
+               const void *pt,
+               int         pt_len,
+               const void *ad,
+               int         ad_len,
+               void       *ct,
+               void       *tag,
+               int         final)
+{
+	union { uint32_t u32[4]; uint8_t u8[16]; block bl; } tmp;
+    block offset, checksum;
+    unsigned i, k;
+    block       * ctp = (block *)ct;
+    const block * ptp = (block *)pt;
+
+    /* Non-null nonce means start of new message, init per-message values */
+    if (nonce) {
+        ctx->offset = gen_offset_from_nonce(ctx, nonce);
+        ctx->ad_offset = ctx->checksum   = zero_block();
+        ctx->ad_blocks_processed = ctx->blocks_processed    = 0;
+        /* A negative ad_len leaves the previous ad_checksum in place */
+        if (ad_len >= 0)
+        	ctx->ad_checksum = zero_block();
+    }
+
+	/* Process associated data */
+	if (ad_len > 0)
+		process_ad(ctx, ad, ad_len, final);
+
+	/* Encrypt plaintext data BPI blocks at a time */
+    offset = ctx->offset;
+    checksum  = ctx->checksum;
+    i = pt_len/(BPI*16);
+    if (i) {
+    	block oa[BPI];
+    	unsigned block_num = ctx->blocks_processed;
+    	oa[BPI-1] = offset;             /* Carries the offset between groups */
+		do {
+			block ta[BPI];
+			block_num += BPI;
+			oa[0] = xor_block(oa[BPI-1], ctx->L[0]);
+			ta[0] = xor_block(oa[0], ptp[0]);
+			checksum = xor_block(checksum, ptp[0]);
+			oa[1] = xor_block(oa[0], ctx->L[1]);
+			ta[1] = xor_block(oa[1], ptp[1]);
+			checksum = xor_block(checksum, ptp[1]);
+			oa[2] = xor_block(oa[1], ctx->L[0]);
+			ta[2] = xor_block(oa[2], ptp[2]);
+			checksum = xor_block(checksum, ptp[2]);
+			#if BPI == 4
+				oa[3] = xor_block(oa[2], getL(ctx, ntz(block_num)));
+				ta[3] = xor_block(oa[3], ptp[3]);
+				checksum = xor_block(checksum, ptp[3]);
+			#elif BPI == 8
+				oa[3] = xor_block(oa[2], ctx->L[2]);
+				ta[3] = xor_block(oa[3], ptp[3]);
+				checksum = xor_block(checksum, ptp[3]);
+				oa[4] = xor_block(oa[1], ctx->L[2]);
+				ta[4] = xor_block(oa[4], ptp[4]);
+				checksum = xor_block(checksum, ptp[4]);
+				oa[5] = xor_block(oa[0], ctx->L[2]);
+				ta[5] = xor_block(oa[5], ptp[5]);
+				checksum = xor_block(checksum, ptp[5]);
+				/* oa[7] still holds the offset this group started from */
+				oa[6] = xor_block(oa[7], ctx->L[2]);
+				ta[6] = xor_block(oa[6], ptp[6]);
+				checksum = xor_block(checksum, ptp[6]);
+				oa[7] = xor_block(oa[6], getL(ctx, ntz(block_num)));
+				ta[7] = xor_block(oa[7], ptp[7]);
+				checksum = xor_block(checksum, ptp[7]);
+			#endif
+			AES_ecb_encrypt_blks(ta,BPI,&ctx->encrypt_key);
+			ctp[0] = xor_block(ta[0], oa[0]);
+			ctp[1] = xor_block(ta[1], oa[1]);
+			ctp[2] = xor_block(ta[2], oa[2]);
+			ctp[3] = xor_block(ta[3], oa[3]);
+			#if (BPI == 8)
+			ctp[4] = xor_block(ta[4], oa[4]);
+			ctp[5] = xor_block(ta[5], oa[5]);
+			ctp[6] = xor_block(ta[6], oa[6]);
+			ctp[7] = xor_block(ta[7], oa[7]);
+			#endif
+			ptp += BPI;
+			ctp += BPI;
+		} while (--i);
+    	ctx->offset = offset = oa[BPI-1];
+	    ctx->blocks_processed = block_num;
+		ctx->checksum = checksum;
+    }
+    
+    if (final) {
+		block ta[BPI+1], oa[BPI];       /* ta[] has one extra slot for the tag */
+				
+        /* Process remaining plaintext and compute its tag contribution    */
+        unsigned remaining = ((unsigned)pt_len) % (BPI*16);
+        k = 0;                      /* How many blocks in ta[] need ECBing */
+        if (remaining) {
+			#if (BPI == 8)
+			if (remaining >= 64) {
+				oa[0] = xor_block(offset, ctx->L[0]);
+				ta[0] = xor_block(oa[0], ptp[0]);
+				checksum = xor_block(checksum, ptp[0]);
+				oa[1] = xor_block(oa[0], ctx->L[1]);
+				ta[1] = xor_block(oa[1], ptp[1]);
+				checksum = xor_block(checksum, ptp[1]);
+				oa[2] = xor_block(oa[1], ctx->L[0]);
+				ta[2] = xor_block(oa[2], ptp[2]);
+				checksum = xor_block(checksum, ptp[2]);
+				offset = oa[3] = xor_block(oa[2], ctx->L[2]);
+				ta[3] = xor_block(offset, ptp[3]);
+				checksum = xor_block(checksum, ptp[3]);
+				remaining -= 64;
+				k = 4;
+			}
+			#endif
+			if (remaining >= 32) {
+				oa[k] = xor_block(offset, ctx->L[0]);
+				ta[k] = xor_block(oa[k], ptp[k]);
+				checksum = xor_block(checksum, ptp[k]);
+				offset = oa[k+1] = xor_block(oa[k], ctx->L[1]);
+				ta[k+1] = xor_block(offset, ptp[k+1]);
+				checksum = xor_block(checksum, ptp[k+1]);
+				remaining -= 32;
+				k+=2;
+			}
+			if (remaining >= 16) {
+				offset = oa[k] = xor_block(offset, ctx->L[0]);
+				ta[k] = xor_block(offset, ptp[k]);
+				checksum = xor_block(checksum, ptp[k]);
+				remaining -= 16;
+				++k;
+			}
+			if (remaining) {
+				/* Partial last block: the 0x80-padded plaintext goes into
+				   the checksum; ta[k] holds the Lstar offset, whose
+				   encryption is the pad xored into the plaintext below */
+				tmp.bl = zero_block();
+				memcpy(tmp.u8, ptp+k, remaining);
+				tmp.u8[remaining] = (unsigned char)0x80u;
+				checksum = xor_block(checksum, tmp.bl);
+				ta[k] = offset = xor_block(offset,ctx->Lstar);
+				++k;
+			}
+		}
+        offset = xor_block(offset, ctx->Ldollar);      /* Part of tag gen */
+        ta[k] = xor_block(offset, checksum);           /* Part of tag gen */
+		AES_ecb_encrypt_blks(ta,k+1,&ctx->encrypt_key);
+		offset = xor_block(ta[k], ctx->ad_checksum);   /* Part of tag gen */
+		if (remaining) {
+			/* ta[k] is now the encrypted pad for the partial block */
+			--k;
+			tmp.bl = xor_block(tmp.bl, ta[k]);
+			memcpy(ctp+k, tmp.u8, remaining);
+		}
+		/* Emit the k whole ciphertext blocks; every case deliberately
+		   falls through to the ones below it */
+		switch (k) {
+			#if (BPI == 8)
+			case 7: ctp[6] = xor_block(ta[6], oa[6]);
+			case 6: ctp[5] = xor_block(ta[5], oa[5]);
+			case 5: ctp[4] = xor_block(ta[4], oa[4]);
+			case 4: ctp[3] = xor_block(ta[3], oa[3]);
+			#endif
+			case 3: ctp[2] = xor_block(ta[2], oa[2]);
+			case 2: ctp[1] = xor_block(ta[1], oa[1]);
+			case 1: ctp[0] = xor_block(ta[0], oa[0]);
+		}
+        
+        /* Tag is placed at the correct location
+         */
+        if (tag) {
+			#if (OCB_TAG_LEN == 16)
+            	*(block *)tag = offset;
+			#elif (OCB_TAG_LEN > 0)
+	            memcpy((char *)tag, &offset, OCB_TAG_LEN);
+			#else
+	            memcpy((char *)tag, &offset, ctx->tag_len);
+	        #endif
+        } else {
+			/* No separate tag buffer: append the tag to the ciphertext */
+			#if (OCB_TAG_LEN > 0)
+	            memcpy((char *)ct + pt_len, &offset, OCB_TAG_LEN);
+            	pt_len += OCB_TAG_LEN;
+			#else
+	            memcpy((char *)ct + pt_len, &offset, ctx->tag_len);
+            	pt_len += ctx->tag_len;
+	        #endif
+        }
+    }
+    return (int) pt_len;
+}
+
+/* ----------------------------------------------------------------------- */
+
+/* Compare n bytes without data-dependent branches or early exit, so the
+   running time does not reveal how many leading tag bytes matched.
+   Returns nonzero iff the buffers differ. */
+static int ocb_tag_differs(const void *av, const void *bv, size_t n)
+{
+	const uint8_t *a = (const uint8_t *)av;
+	const uint8_t *b = (const uint8_t *)bv;
+	uint8_t diff = 0;
+	size_t i;
+	for (i = 0; i < n; i++)
+		diff = (uint8_t)(diff | (a[i] ^ b[i]));
+	return diff != 0;
+}
+
+/* Decrypt (part of) a message and, on the final call, verify its tag.
+ *
+ *   ctx     context set up by ae_init
+ *   nonce   12-byte nonce on the first call for a message (resets the
+ *           per-message state); NULL on continuation calls
+ *   ct      ciphertext, read as an array of `block`; ct_len is in bytes and
+ *           includes the tag when `final` is nonzero and `tag` is NULL
+ *   ad      associated data, passed to process_ad when ad_len > 0
+ *   pt      receives the plaintext, written as an array of `block`
+ *   tag     tag to verify on the final call; if NULL the tag is read from
+ *           the end of the ciphertext
+ *   final   nonzero if this call completes the message
+ *
+ * Returns the number of plaintext bytes (ct_len minus any bundled tag), or
+ * AE_INVALID if the ciphertext is too short to contain its bundled tag or
+ * the tag does not verify.  Note that plaintext is written to `pt` before
+ * the tag is checked; callers must discard it when AE_INVALID is returned. */
+int ae_decrypt(ae_ctx     *ctx,
+               const void *nonce,
+               const void *ct,
+               int         ct_len,
+               const void *ad,
+               int         ad_len,
+               void       *pt,
+               const void *tag,
+               int         final)
+{
+	union { uint32_t u32[4]; uint8_t u8[16]; block bl; } tmp;
+    block offset, checksum;
+    unsigned i, k;
+    block       *ctp = (block *)ct;
+    block       *ptp = (block *)pt;
+		
+	/* Reduce ct_len tag bundled in ct */
+	if ((final) && (!tag)) {
+		#if (OCB_TAG_LEN > 0)
+			ct_len -= OCB_TAG_LEN;
+		#else
+			ct_len -= ctx->tag_len;
+		#endif
+	}
+
+	/* A negative length (e.g. input shorter than its bundled tag) would be
+	   turned into a huge unsigned byte count below; reject it up front */
+	if (ct_len < 0)
+		return AE_INVALID;
+
+    /* Non-null nonce means start of new message, init per-message values */
+    if (nonce) {
+        ctx->offset = gen_offset_from_nonce(ctx, nonce);
+        ctx->ad_offset = ctx->checksum   = zero_block();
+        ctx->ad_blocks_processed = ctx->blocks_processed    = 0;
+        if (ad_len >= 0)
+        	ctx->ad_checksum = zero_block();
+    }
+
+	/* Process associated data */
+	if (ad_len > 0)
+		process_ad(ctx, ad, ad_len, final);
+
+	/* Decrypt ciphertext data BPI blocks at a time */
+    offset = ctx->offset;
+    checksum  = ctx->checksum;
+    i = ct_len/(BPI*16);
+    if (i) {
+    	block oa[BPI];
+    	unsigned block_num = ctx->blocks_processed;
+    	oa[BPI-1] = offset;             /* Carries the offset between groups */
+		do {
+			block ta[BPI];
+			block_num += BPI;
+			oa[0] = xor_block(oa[BPI-1], ctx->L[0]);
+			ta[0] = xor_block(oa[0], ctp[0]);
+			oa[1] = xor_block(oa[0], ctx->L[1]);
+			ta[1] = xor_block(oa[1], ctp[1]);
+			oa[2] = xor_block(oa[1], ctx->L[0]);
+			ta[2] = xor_block(oa[2], ctp[2]);
+			#if BPI == 4
+				oa[3] = xor_block(oa[2], getL(ctx, ntz(block_num)));
+				ta[3] = xor_block(oa[3], ctp[3]);
+			#elif BPI == 8
+				oa[3] = xor_block(oa[2], ctx->L[2]);
+				ta[3] = xor_block(oa[3], ctp[3]);
+				oa[4] = xor_block(oa[1], ctx->L[2]);
+				ta[4] = xor_block(oa[4], ctp[4]);
+				oa[5] = xor_block(oa[0], ctx->L[2]);
+				ta[5] = xor_block(oa[5], ctp[5]);
+				/* oa[7] still holds the offset this group started from */
+				oa[6] = xor_block(oa[7], ctx->L[2]);
+				ta[6] = xor_block(oa[6], ctp[6]);
+				oa[7] = xor_block(oa[6], getL(ctx, ntz(block_num)));
+				ta[7] = xor_block(oa[7], ctp[7]);
+			#endif
+			AES_ecb_decrypt_blks(ta,BPI,&ctx->decrypt_key);
+			ptp[0] = xor_block(ta[0], oa[0]);
+			checksum = xor_block(checksum, ptp[0]);
+			ptp[1] = xor_block(ta[1], oa[1]);
+			checksum = xor_block(checksum, ptp[1]);
+			ptp[2] = xor_block(ta[2], oa[2]);
+			checksum = xor_block(checksum, ptp[2]);
+			ptp[3] = xor_block(ta[3], oa[3]);
+			checksum = xor_block(checksum, ptp[3]);
+			#if (BPI == 8)
+			ptp[4] = xor_block(ta[4], oa[4]);
+			checksum = xor_block(checksum, ptp[4]);
+			ptp[5] = xor_block(ta[5], oa[5]);
+			checksum = xor_block(checksum, ptp[5]);
+			ptp[6] = xor_block(ta[6], oa[6]);
+			checksum = xor_block(checksum, ptp[6]);
+			ptp[7] = xor_block(ta[7], oa[7]);
+			checksum = xor_block(checksum, ptp[7]);
+			#endif
+			ptp += BPI;
+			ctp += BPI;
+		} while (--i);
+    	ctx->offset = offset = oa[BPI-1];
+	    ctx->blocks_processed = block_num;
+		ctx->checksum = checksum;
+    }
+    
+    if (final) {
+		block ta[BPI+1], oa[BPI];
+				
+        /* Process remaining ciphertext and compute its tag contribution   */
+        unsigned remaining = ((unsigned)ct_len) % (BPI*16);
+        k = 0;                      /* How many blocks in ta[] need ECBing */
+        if (remaining) {
+			#if (BPI == 8)
+			if (remaining >= 64) {
+				oa[0] = xor_block(offset, ctx->L[0]);
+				ta[0] = xor_block(oa[0], ctp[0]);
+				oa[1] = xor_block(oa[0], ctx->L[1]);
+				ta[1] = xor_block(oa[1], ctp[1]);
+				oa[2] = xor_block(oa[1], ctx->L[0]);
+				ta[2] = xor_block(oa[2], ctp[2]);
+				offset = oa[3] = xor_block(oa[2], ctx->L[2]);
+				ta[3] = xor_block(offset, ctp[3]);
+				remaining -= 64;
+				k = 4;
+			}
+			#endif
+			if (remaining >= 32) {
+				oa[k] = xor_block(offset, ctx->L[0]);
+				ta[k] = xor_block(oa[k], ctp[k]);
+				offset = oa[k+1] = xor_block(oa[k], ctx->L[1]);
+				ta[k+1] = xor_block(offset, ctp[k+1]);
+				remaining -= 32;
+				k+=2;
+			}
+			if (remaining >= 16) {
+				offset = oa[k] = xor_block(offset, ctx->L[0]);
+				ta[k] = xor_block(offset, ctp[k]);
+				remaining -= 16;
+				++k;
+			}
+			if (remaining) {
+				/* Partial last block: the pad is the *encryption* of the
+				   Lstar offset.  After the xor, bytes past `remaining` are
+				   pad^pad = 0, so writing 0x80 yields the padded plaintext
+				   that goes into the checksum */
+				block pad;
+				offset = xor_block(offset,ctx->Lstar);
+				AES_encrypt((unsigned char *)&offset, tmp.u8, &ctx->encrypt_key);
+				pad = tmp.bl;
+				memcpy(tmp.u8,ctp+k,remaining);
+				tmp.bl = xor_block(tmp.bl, pad);
+				tmp.u8[remaining] = (unsigned char)0x80u;
+				memcpy(ptp+k, tmp.u8, remaining);
+				checksum = xor_block(checksum, tmp.bl);
+			}
+		}
+		AES_ecb_decrypt_blks(ta,k,&ctx->decrypt_key);
+		/* Emit the k whole plaintext blocks and fold them into the
+		   checksum; every case deliberately falls through to the ones
+		   below it */
+		switch (k) {
+			#if (BPI == 8)
+			case 7: ptp[6] = xor_block(ta[6], oa[6]);
+				    checksum = xor_block(checksum, ptp[6]);
+			case 6: ptp[5] = xor_block(ta[5], oa[5]);
+				    checksum = xor_block(checksum, ptp[5]);
+			case 5: ptp[4] = xor_block(ta[4], oa[4]);
+				    checksum = xor_block(checksum, ptp[4]);
+			case 4: ptp[3] = xor_block(ta[3], oa[3]);
+				    checksum = xor_block(checksum, ptp[3]);
+			#endif
+			case 3: ptp[2] = xor_block(ta[2], oa[2]);
+				    checksum = xor_block(checksum, ptp[2]);
+			case 2: ptp[1] = xor_block(ta[1], oa[1]);
+				    checksum = xor_block(checksum, ptp[1]);
+			case 1: ptp[0] = xor_block(ta[0], oa[0]);
+				    checksum = xor_block(checksum, ptp[0]);
+		}
+		
+		/* Calculate expected tag */
+        offset = xor_block(offset, ctx->Ldollar);
+        tmp.bl = xor_block(offset, checksum);
+		AES_encrypt(tmp.u8, tmp.u8, &ctx->encrypt_key);
+		tmp.bl = xor_block(tmp.bl, ctx->ad_checksum); /* Full tag */
+
+		/* Compare with proposed tag, change ct_len if invalid */
+		if ((OCB_TAG_LEN == 16) && tag) {
+			if (unequal_blocks(tmp.bl, *(block *)tag))
+				ct_len = AE_INVALID;
+		} else {
+			#if (OCB_TAG_LEN > 0)
+				int len = OCB_TAG_LEN;
+			#else
+				int len = ctx->tag_len;
+			#endif
+			/* Proposed tag: the caller's buffer, or the bytes that follow
+			   the ciphertext when the tag was bundled with it */
+			const void *proposed = tag ? tag
+			                           : (const void *)((const char *)ct + ct_len);
+			if (ocb_tag_differs(proposed, tmp.u8, (size_t)len))
+				ct_len = AE_INVALID;
+		}
+    }
+    return ct_len;
+ }
+
+/* Human-readable name of the AES backend selected at compile time.
+   NOTE(review): infoString is not defined at all if none of the three
+   USE_* macros below is nonzero -- confirm one of them is always set. */
+#if USE_AES_NI
+char infoString[] = "OCB (AES-NI)";
+#elif USE_REFERENCE_AES
+char infoString[] = "OCB (Reference AES)";
+#elif USE_OPENSSL_AES
+char infoString[] = "OCB (OpenSSL AES)";
+#endif