From 215c75c6eae825b6f7b48191fe0e9c675dfe7ab6 Mon Sep 17 00:00:00 2001
From: Keith Winstein <keithw@mit.edu>
Date: Thu, 4 Aug 2011 04:07:36 -0400
Subject: [PATCH] Import AES-OCB3 implementation and driver code

---
 Makefile    |   14 +-
 ae.hpp      |  182 ++++++++
 base64.cpp  |  577 ++++++++++++++++++++++++
 base64.h    |   65 +++
 crypto.cpp  |  211 +++++++++
 crypto.hpp  |   65 +++
 decrypt.cpp |   58 +++
 encrypt.cpp |   74 ++++
 grant.htm   |   38 ++
 ocb.cpp     | 1226 +++++++++++++++++++++++++++++++++++++++++++++++++++
 10 files changed, 2506 insertions(+), 4 deletions(-)
 create mode 100644 ae.hpp
 create mode 100644 base64.cpp
 create mode 100644 base64.h
 create mode 100644 crypto.cpp
 create mode 100644 crypto.hpp
 create mode 100644 decrypt.cpp
 create mode 100644 encrypt.cpp
 create mode 100644 grant.htm
 create mode 100644 ocb.cpp

diff --git a/Makefile b/Makefile
index b962c2e..57a3b6b 100644
--- a/Makefile
+++ b/Makefile
@@ -1,11 +1,11 @@
-source = parse.cpp parserstate.cpp parser.cpp templates.cpp terminal.cpp termemu.cpp parseraction.cpp terminalfunctions.cpp swrite.cpp terminalframebuffer.cpp terminaldispatcher.cpp terminaluserinput.cpp terminaldisplay.cpp network.cpp ntester.cpp
-objects = parserstate.o parser.o templates.o terminal.o parseraction.o terminalfunctions.o swrite.o terminalframebuffer.o terminaldispatcher.o terminaluserinput.o terminaldisplay.o network.o
+source = parse.cpp parserstate.cpp parser.cpp templates.cpp terminal.cpp termemu.cpp parseraction.cpp terminalfunctions.cpp swrite.cpp terminalframebuffer.cpp terminaldispatcher.cpp terminaluserinput.cpp terminaldisplay.cpp network.cpp ntester.cpp ocb.cpp base64.cpp encrypt.cpp decrypt.cpp crypto.cpp
+objects = parserstate.o parser.o templates.o terminal.o parseraction.o terminalfunctions.o swrite.o terminalframebuffer.o terminaldispatcher.o terminaluserinput.o terminaldisplay.o network.o ocb.o base64.o crypto.o
 repos = templates.rpo
-executables = parse termemu ntester
+executables = parse termemu ntester encrypt decrypt
 
 CXX = g++
 CXXFLAGS = -g --std=c++0x -pedantic -Werror -Wall -Wextra -Weffc++ -fno-implicit-templates -fno-default-inline -pipe -D_FILE_OFFSET_BITS=64 -D_XOPEN_SOURCE=500 -D_GNU_SOURCE
-LIBS = -lutil
+LIBS = -lutil -lssl
 
 all: $(executables)
 
@@ -18,6 +18,12 @@ termemu: termemu.o $(objects) parse # serialize link steps because of -frepo
 ntester: ntester.o $(objects) termemu # serialize link steps because of -frepo
 	$(CXX) $(CXXFLAGS) -o $@ ntester.o $(objects) $(LIBS)
 
+encrypt: encrypt.o $(objects) ntester # serialize link steps because of -frepo
+	$(CXX) $(CXXFLAGS) -o $@ encrypt.o $(objects) $(LIBS)
+
+decrypt: decrypt.o $(objects) encrypt # serialize link steps because of -frepo
+	$(CXX) $(CXXFLAGS) -o $@ decrypt.o $(objects) $(LIBS)
+
 templates.o: templates.cpp
 	$(CXX) $(CXXFLAGS) -frepo -c -o $@ $<
 
diff --git a/ae.hpp b/ae.hpp
new file mode 100644
index 0000000..fb5c511
--- /dev/null
+++ b/ae.hpp
@@ -0,0 +1,182 @@
+/* ---------------------------------------------------------------------------
+ *
+ * AEAD API 0.12 - 13 July 2011
+ *
+ * This file gives an interface appropriate for many authenticated
+ * encryption with associated data (AEAD) implementations. It does not try
+ * to accommodate all possible options or limitations that an implementation
+ * might have -- you should consult the documentation of your chosen
+ * implementation to find things like RFC 5116 constants, alignment
+ * requirements, whether the incremental interface is supported, etc.
+ *
+ * This file is in the public domain. It is provided "as is", without
+ * warranty of any kind. Use at your own risk.
+ *
+ * Comments are welcome: Ted Krovetz <ted@krovetz>.
+ *
+ * ------------------------------------------------------------------------ */
+
+#ifndef _AE_H_
+#define _AE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* --------------------------------------------------------------------------
+ *
+ * Constants
+ *
+ * ----------------------------------------------------------------------- */
+
+/* Return status codes: Negative return values indicate an error occurred.
+ * For full explanations of error values, consult the implementation's
+ * documentation.                                                          */
+#define AE_SUCCESS       ( 0)  /* Indicates successful completion of call  */
+#define AE_INVALID       (-1)  /* Indicates bad tag during decryption      */
+#define AE_NOT_SUPPORTED (-2)  /* Indicates unsupported option requested   */
+
+/* Flags: When data can be processed "incrementally", these flags are used
+ * to indicate whether the submitted data is the last or not.               */
+#define AE_FINALIZE      (1)   /* This is the last of data                  */
+#define AE_PENDING       (0)   /* More data is coming                       */
+
+/* --------------------------------------------------------------------------
+ *
+ * AEAD opaque structure definition
+ *
+ * ----------------------------------------------------------------------- */
+
+typedef struct _ae_ctx ae_ctx;
+
+/* --------------------------------------------------------------------------
+ *
+ * Data Structure Routines
+ *
+ * ----------------------------------------------------------------------- */
+
+ae_ctx* ae_allocate  (void *misc);  /* Allocate ae_ctx, set optional ptr   */
+void    ae_free      (ae_ctx *ctx); /* Deallocate ae_ctx struct            */
+int     ae_clear     (ae_ctx *ctx); /* Undo initialization                 */
+int     ae_ctx_sizeof(void);        /* Return sizeof(ae_ctx)               */
+/* ae_allocate() allocates an ae_ctx structure, but does not initialize it.
+ * ae_free() deallocates an ae_ctx structure, but does not zeroize it.
+ * ae_clear() zeroes sensitive values associated with an ae_ctx structure
+ * and deallocates any auxiliary structures allocated during ae_init().
+ * ae_ctx_sizeof() returns sizeof(ae_ctx), to aid in any static allocations.
+ */
+
+/* --------------------------------------------------------------------------
+ *
+ * AEAD Routines
+ *
+ * ----------------------------------------------------------------------- */
+
+int ae_init(ae_ctx     *ctx,
+            const void *key,
+            int         key_len,
+            int         nonce_len,
+            int         tag_len);
+/* --------------------------------------------------------------------------
+ *
+ * Initialize an ae_ctx context structure.
+ *
+ * Parameters:
+ *  ctx       - Pointer to an ae_ctx structure to be initialized
+ *  key       - Pointer to user-supplied key
+ *  key_len   - Length of key supplied, in bytes
+ *  nonce_len - Length of nonces to be used for this key, in bytes
+ *  tag_len   - Length of tags to be produced for this key, in bytes
+ *
+ * Returns:
+ *  AE_SUCCESS       - Success. Ctx ready for use.
+ *  AE_NOT_SUPPORTED - An unsupported length was supplied. Ctx is untouched.
+ *  Otherwise        - Error. Check implementation documentation for codes.
+ *
+ * ----------------------------------------------------------------------- */
+
+int ae_encrypt(ae_ctx     *ctx,
+               const void *nonce,
+               const void *pt,
+               int         pt_len,
+               const void *ad,
+               int         ad_len,
+               void       *ct,
+               void       *tag,
+               int         final);
+/* --------------------------------------------------------------------------
+ *
+ * Encrypt plaintext; provide for authentication of ciphertext/associated data.
+ *
+ * Parameters:
+ *  ctx    - Pointer to an ae_ctx structure initialized by ae_init.
+ *  nonce  - Pointer to a nonce_len (defined in ae_init) byte nonce.
+ *  pt     - Pointer to plaintext bytes to be encrypted.
+ *  pt_len - number of bytes pointed to by pt.
+ *  ad     - Pointer to associated data.
+ *  ad_len - number of bytes pointed to by ad.
+ *  ct     - Pointer to buffer to receive ciphertext encryption.
+ *  tag    - Pointer to receive authentication tag; or NULL
+ *           if tag is to be bundled into the ciphertext.
+ *  final  - Non-zero if this call completes the plaintext being encrypted.
+ *
+ * If nonce!=NULL then a message is being initiated. If final!=0
+ * then a message is being finalized. If final==0 or nonce==NULL
+ * then the incremental interface is being used. If nonce!=NULL and
+ * ad_len<0, then use same ad as last message.
+ *
+ * Returns:
+ *  non-negative     - Number of bytes written to ct.
+ *  AE_NOT_SUPPORTED - Usage mode unsupported (eg, incremental and/or sticky).
+ *  Otherwise        - Error. Check implementation documentation for codes.
+ *
+ * ----------------------------------------------------------------------- */
+
+int ae_decrypt(ae_ctx     *ctx,
+               const void *nonce,
+               const void *ct,
+               int         ct_len,
+               const void *ad,
+               int         ad_len,
+               void       *pt,
+               const void *tag,
+               int         final);
+/* --------------------------------------------------------------------------
+ *
+ * Decrypt ciphertext; provide authenticity of plaintext and associated data.
+ *
+ * Parameters:
+ *  ctx    - Pointer to an ae_ctx structure initialized by ae_init.
+ *  nonce  - Pointer to a nonce_len (defined in ae_init) byte nonce.
+ *  ct     - Pointer to ciphertext bytes to be decrypted.
+ *  ct_len - number of bytes pointed to by ct.
+ *  ad     - Pointer to associated data.
+ *  ad_len - number of bytes pointed to by ad.
+ *  pt     - Pointer to buffer to receive plaintext decryption.
+ *  tag    - Pointer to tag_len (defined in ae_init) bytes; or NULL
+ *           if tag is bundled into the ciphertext. Non-NULL tag is only
+ *           read when final is non-zero.
+ *  final  - Non-zero if this call completes the ciphertext being decrypted.
+ *
+ * If nonce!=NULL then "ct" points to the start of a ciphertext. If final!=0
+ * then "ct" points to the final piece of ciphertext. If final==0 or nonce==
+ * NULL then the incremental interface is being used. If nonce!=NULL and
+ * ad_len<0, then use same ad as last message.
+ *
+ * Returns:
+ *  non-negative     - Number of bytes written to pt.
+ *  AE_INVALID       - Authentication failure.
+ *  AE_NOT_SUPPORTED - Usage mode unsupported (eg, incremental and/or sticky).
+ *  Otherwise        - Error. Check implementation documentation for codes.
+ *
+ * NOTE !!! NOTE !!! -- The ciphertext should be assumed possibly inauthentic
+ *                      until it has been completely written and it is
+ *                      verified that this routine did not return AE_INVALID.
+ *
+ * ----------------------------------------------------------------------- */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* _AE_H_ */
diff --git a/base64.cpp b/base64.cpp
new file mode 100644
index 0000000..028e2f4
--- /dev/null
+++ b/base64.cpp
@@ -0,0 +1,577 @@
+/* Taken from GNU coreutils */
+
+/* base64.c -- Encode binary data using printable characters.
+   Copyright (C) 1999-2001, 2004-2006, 2009-2011 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
+
+/* Written by Simon Josefsson.  Partially adapted from GNU MailUtils
+ * (mailbox/filter_trans.c, as of 2004-11-28).  Improved by review
+ * from Paul Eggert, Bruno Haible, and Stepan Kasal.
+ *
+ * See also RFC 3548 <http://www.ietf.org/rfc/rfc3548.txt>.
+ *
+ * Be careful with error checking.  Here is how you would typically
+ * use these functions:
+ *
+ * bool ok = base64_decode_alloc (in, inlen, &out, &outlen);
+ * if (!ok)
+ *   FAIL: input was not valid base64
+ * if (out == NULL)
+ *   FAIL: memory allocation error
+ * OK: data in OUT/OUTLEN
+ *
+ * size_t outlen = base64_encode_alloc (in, inlen, &out);
+ * if (out == NULL && outlen == 0 && inlen != 0)
+ *   FAIL: input too long
+ * if (out == NULL)
+ *   FAIL: memory allocation error
+ * OK: data in OUT/OUTLEN.
+ *
+ */
+
+// #include <config.h>
+
+/* Get prototype. */
+#include "base64.h"
+
+/* Get malloc. */
+#include <stdlib.h>
+
+/* Get UCHAR_MAX. */
+#include <limits.h>
+
+#include <string.h>
+
+/* C89-compliant conversion of 'char' to 'unsigned char': implicit, no cast. */
+static inline unsigned char
+to_uchar (char ch)
+{
+  return ch;
+}
+
+/* Base64 encode IN (INLEN bytes) into OUT (OUTLEN bytes), padding a short
+   final group with '='.  If OUTLEN is less than BASE64_LENGTH(INLEN),
+   write as many bytes as fit.  If OUTLEN is larger than
+   BASE64_LENGTH(INLEN), also zero terminate the output buffer. */
+void
+base64_encode (const char *restrict in, size_t inlen,
+               char *restrict out, size_t outlen)
+{
+  static const char b64str[65] = /* KJW */
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+  while (inlen && outlen)
+    {
+      *out++ = b64str[(to_uchar (in[0]) >> 2) & 0x3f]; /* top 6 bits of in[0] */
+      if (!--outlen)
+        break;
+      *out++ = b64str[((to_uchar (in[0]) << 4)
+                       + (--inlen ? to_uchar (in[1]) >> 4 : 0))
+                      & 0x3f]; /* low 2 bits of in[0], top 4 bits of in[1] */
+      if (!--outlen)
+        break;
+      *out++ =
+        (inlen
+         ? b64str[((to_uchar (in[1]) << 2)
+                   + (--inlen ? to_uchar (in[2]) >> 6 : 0))
+                  & 0x3f] /* low 4 bits of in[1], top 2 bits of in[2] */
+         : '=');
+      if (!--outlen)
+        break;
+      *out++ = inlen ? b64str[to_uchar (in[2]) & 0x3f] : '='; /* low 6 bits of in[2] */
+      if (!--outlen)
+        break;
+      if (inlen) /* in[2] was consumed above; account for it */
+        inlen--;
+      if (inlen) /* advance only while input remains */
+        in += 3;
+    }
+
+  if (outlen)
+    *out = '\0';
+}
+
+/* Encode INLEN bytes at IN as base64 into a newly malloc'ed, zero
+   terminated buffer and store that buffer in *OUT; the caller must
+   free it.
+
+   Return values:
+     - success: BASE64_LENGTH(INLEN), the encoded length excluding the
+       terminating zero;
+     - output length would overflow: 0, with *OUT set to NULL;
+     - malloc failed: BASE64_LENGTH(INLEN) + 1, the size that was
+       requested, with *OUT set to NULL.  */
+size_t
+base64_encode_alloc (const char *in, size_t inlen, char **out)
+{
+  size_t bufsize = BASE64_LENGTH (inlen) + 1;
+  char *encoded;
+
+  /* Overflow detection.  Without overflow, bufsize >= inlen holds.
+
+     If (inlen + 2) wraps around it yields at most 1, so BASE64_LENGTH
+     is 0 and bufsize is 1, which is below inlen.
+
+     If the multiplication by 4 wraps, at least half of the correct
+     value is lost, leaving less than ((inlen + 2) / 3) * 2, i.e. less
+     than (inlen + 2) * 0.66667, which is below inlen once inlen > 4.  */
+  if (bufsize < inlen)
+    {
+      *out = NULL;
+      return 0;
+    }
+
+  /* The cast is needed because this file is built as C++ (base64.cpp).  */
+  encoded = (char *) malloc (bufsize);
+  *out = encoded;
+  if (encoded == NULL)
+    return bufsize;
+
+  base64_encode (in, inlen, encoded, bufsize);
+
+  return bufsize - 1;
+}
+
+/* B64(c) yields the 6-bit value of character C in the Base64 alphabet
+   (A-Za-z0-9+/), or -1 if C is not in it.  Comparing against character
+   literals keeps this file independent of the charset used (think
+   EBCDIC), assuming only that the alphabet is encoded in 0..255.  POSIX
+   1003.1-2001 requires that char and unsigned char are 8-bit quantities,
+   but this may be a problem on non-POSIX C99 platforms.
+
+   IBM C V6 for AIX mishandles "#define B64(x) ...'x'...", so use "_"
+   as the formal parameter rather than "x".  */
+#define B64(_)                                  \
+  ((_) == 'A' ? 0                               \
+   : (_) == 'B' ? 1                             \
+   : (_) == 'C' ? 2                             \
+   : (_) == 'D' ? 3                             \
+   : (_) == 'E' ? 4                             \
+   : (_) == 'F' ? 5                             \
+   : (_) == 'G' ? 6                             \
+   : (_) == 'H' ? 7                             \
+   : (_) == 'I' ? 8                             \
+   : (_) == 'J' ? 9                             \
+   : (_) == 'K' ? 10                            \
+   : (_) == 'L' ? 11                            \
+   : (_) == 'M' ? 12                            \
+   : (_) == 'N' ? 13                            \
+   : (_) == 'O' ? 14                            \
+   : (_) == 'P' ? 15                            \
+   : (_) == 'Q' ? 16                            \
+   : (_) == 'R' ? 17                            \
+   : (_) == 'S' ? 18                            \
+   : (_) == 'T' ? 19                            \
+   : (_) == 'U' ? 20                            \
+   : (_) == 'V' ? 21                            \
+   : (_) == 'W' ? 22                            \
+   : (_) == 'X' ? 23                            \
+   : (_) == 'Y' ? 24                            \
+   : (_) == 'Z' ? 25                            \
+   : (_) == 'a' ? 26                            \
+   : (_) == 'b' ? 27                            \
+   : (_) == 'c' ? 28                            \
+   : (_) == 'd' ? 29                            \
+   : (_) == 'e' ? 30                            \
+   : (_) == 'f' ? 31                            \
+   : (_) == 'g' ? 32                            \
+   : (_) == 'h' ? 33                            \
+   : (_) == 'i' ? 34                            \
+   : (_) == 'j' ? 35                            \
+   : (_) == 'k' ? 36                            \
+   : (_) == 'l' ? 37                            \
+   : (_) == 'm' ? 38                            \
+   : (_) == 'n' ? 39                            \
+   : (_) == 'o' ? 40                            \
+   : (_) == 'p' ? 41                            \
+   : (_) == 'q' ? 42                            \
+   : (_) == 'r' ? 43                            \
+   : (_) == 's' ? 44                            \
+   : (_) == 't' ? 45                            \
+   : (_) == 'u' ? 46                            \
+   : (_) == 'v' ? 47                            \
+   : (_) == 'w' ? 48                            \
+   : (_) == 'x' ? 49                            \
+   : (_) == 'y' ? 50                            \
+   : (_) == 'z' ? 51                            \
+   : (_) == '0' ? 52                            \
+   : (_) == '1' ? 53                            \
+   : (_) == '2' ? 54                            \
+   : (_) == '3' ? 55                            \
+   : (_) == '4' ? 56                            \
+   : (_) == '5' ? 57                            \
+   : (_) == '6' ? 58                            \
+   : (_) == '7' ? 59                            \
+   : (_) == '8' ? 60                            \
+   : (_) == '9' ? 61                            \
+   : (_) == '+' ? 62                            \
+   : (_) == '/' ? 63                            \
+   : -1)
+
+static const signed char b64[0x100] = { /* by unsigned char value; -1 = not in alphabet */
+  B64 (0), B64 (1), B64 (2), B64 (3),
+  B64 (4), B64 (5), B64 (6), B64 (7),
+  B64 (8), B64 (9), B64 (10), B64 (11),
+  B64 (12), B64 (13), B64 (14), B64 (15),
+  B64 (16), B64 (17), B64 (18), B64 (19),
+  B64 (20), B64 (21), B64 (22), B64 (23),
+  B64 (24), B64 (25), B64 (26), B64 (27),
+  B64 (28), B64 (29), B64 (30), B64 (31),
+  B64 (32), B64 (33), B64 (34), B64 (35),
+  B64 (36), B64 (37), B64 (38), B64 (39),
+  B64 (40), B64 (41), B64 (42), B64 (43),
+  B64 (44), B64 (45), B64 (46), B64 (47),
+  B64 (48), B64 (49), B64 (50), B64 (51),
+  B64 (52), B64 (53), B64 (54), B64 (55),
+  B64 (56), B64 (57), B64 (58), B64 (59),
+  B64 (60), B64 (61), B64 (62), B64 (63),
+  B64 (64), B64 (65), B64 (66), B64 (67),
+  B64 (68), B64 (69), B64 (70), B64 (71),
+  B64 (72), B64 (73), B64 (74), B64 (75),
+  B64 (76), B64 (77), B64 (78), B64 (79),
+  B64 (80), B64 (81), B64 (82), B64 (83),
+  B64 (84), B64 (85), B64 (86), B64 (87),
+  B64 (88), B64 (89), B64 (90), B64 (91),
+  B64 (92), B64 (93), B64 (94), B64 (95),
+  B64 (96), B64 (97), B64 (98), B64 (99),
+  B64 (100), B64 (101), B64 (102), B64 (103),
+  B64 (104), B64 (105), B64 (106), B64 (107),
+  B64 (108), B64 (109), B64 (110), B64 (111),
+  B64 (112), B64 (113), B64 (114), B64 (115),
+  B64 (116), B64 (117), B64 (118), B64 (119),
+  B64 (120), B64 (121), B64 (122), B64 (123),
+  B64 (124), B64 (125), B64 (126), B64 (127),
+  B64 (128), B64 (129), B64 (130), B64 (131),
+  B64 (132), B64 (133), B64 (134), B64 (135),
+  B64 (136), B64 (137), B64 (138), B64 (139),
+  B64 (140), B64 (141), B64 (142), B64 (143),
+  B64 (144), B64 (145), B64 (146), B64 (147),
+  B64 (148), B64 (149), B64 (150), B64 (151),
+  B64 (152), B64 (153), B64 (154), B64 (155),
+  B64 (156), B64 (157), B64 (158), B64 (159),
+  B64 (160), B64 (161), B64 (162), B64 (163),
+  B64 (164), B64 (165), B64 (166), B64 (167),
+  B64 (168), B64 (169), B64 (170), B64 (171),
+  B64 (172), B64 (173), B64 (174), B64 (175),
+  B64 (176), B64 (177), B64 (178), B64 (179),
+  B64 (180), B64 (181), B64 (182), B64 (183),
+  B64 (184), B64 (185), B64 (186), B64 (187),
+  B64 (188), B64 (189), B64 (190), B64 (191),
+  B64 (192), B64 (193), B64 (194), B64 (195),
+  B64 (196), B64 (197), B64 (198), B64 (199),
+  B64 (200), B64 (201), B64 (202), B64 (203),
+  B64 (204), B64 (205), B64 (206), B64 (207),
+  B64 (208), B64 (209), B64 (210), B64 (211),
+  B64 (212), B64 (213), B64 (214), B64 (215),
+  B64 (216), B64 (217), B64 (218), B64 (219),
+  B64 (220), B64 (221), B64 (222), B64 (223),
+  B64 (224), B64 (225), B64 (226), B64 (227),
+  B64 (228), B64 (229), B64 (230), B64 (231),
+  B64 (232), B64 (233), B64 (234), B64 (235),
+  B64 (236), B64 (237), B64 (238), B64 (239),
+  B64 (240), B64 (241), B64 (242), B64 (243),
+  B64 (244), B64 (245), B64 (246), B64 (247),
+  B64 (248), B64 (249), B64 (250), B64 (251),
+  B64 (252), B64 (253), B64 (254), B64 (255)
+};
+
+#if UCHAR_MAX == 255
+# define uchar_in_range(c) true /* every unsigned char indexes b64[0x100] */
+#else
+# define uchar_in_range(c) ((c) <= 255) /* keep lookups inside b64[0x100] */
+#endif
+
+/* Report whether CH belongs to the Base64 alphabet (A-Z, a-z, 0-9, '+',
+   '/'); the padding character '=' is not part of it and yields false.  */
+bool
+isbase64 (char ch)
+{
+  unsigned char idx = to_uchar (ch);
+  return uchar_in_range (idx) && b64[idx] >= 0;
+}
+
+/* Initialize decode-context buffer, CTX, so it holds no pending bytes.  */
+void
+base64_decode_ctx_init (struct base64_decode_context *ctx)
+{
+  ctx->i = 0;
+}
+
+/* If CTX->i is 0 or 4, there are four or more bytes in [*IN..IN_END), and
+   none of those four is a newline, then return *IN.  Otherwise, copy up to
+   4 - CTX->i non-newline bytes from that range into CTX->buf, starting at
+   index CTX->i and setting CTX->i to reflect the number of bytes copied,
+   and return CTX->buf.  In either case, advance *IN to point to the byte
+   after the last one processed, and set *N_NON_NEWLINE to the number of
+   verified non-newline bytes accessible through the returned pointer.  */
+static inline char *
+get_4 (struct base64_decode_context *ctx,
+       char const *restrict *in, char const *restrict in_end,
+       size_t *n_non_newline)
+{
+  if (ctx->i == 4) /* buffer was filled by an earlier call; start afresh */
+    ctx->i = 0;
+
+  if (ctx->i == 0)
+    {
+      char const *t = *in;
+      if (4 <= in_end - *in && memchr (t, '\n', 4) == NULL)
+        {
+          /* This is the common case: no newline.  */
+          *in += 4;
+          *n_non_newline = 4;
+          return (char *) t; /* points into the input itself, not CTX->buf */
+        }
+    }
+
+  {
+    /* Copy non-newline bytes into BUF.  */
+    char const *p = *in;
+    while (p < in_end)
+      {
+        char c = *p++;
+        if (c != '\n')
+          {
+            ctx->buf[ctx->i++] = c;
+            if (ctx->i == 4) /* a full quad is buffered */
+              break;
+          }
+      }
+
+    *in = p; /* everything before P has been consumed */
+    *n_non_newline = ctx->i;
+    return ctx->buf;
+  }
+}
+
+#define return_false /* store OUT back through OUTP, then fail */ \
+  do                                            \
+    {                                           \
+      *outp = out;                              \
+      return false;                             \
+    }                                           \
+  while (false)
+
+/* Decode up to four bytes of base64-encoded data, IN, of length INLEN
+   into the output buffer, *OUTP, with room for *OUTLEFT bytes.  Return
+   true if decoding is successful, false otherwise.  If *OUTLEFT is too
+   small, as many bytes as possible are written.  On return, advance
+   *OUTP to point to the byte after the last one written, and decrement
+   *OUTLEFT to reflect the number of bytes remaining in *OUTP.  */
+static inline bool
+decode_4 (char const *restrict in, size_t inlen,
+          char *restrict *outp, size_t *outleft)
+{
+  char *out = *outp;
+  if (inlen < 2) /* one output byte needs at least two input chars */
+    return false;
+
+  if (!isbase64 (in[0]) || !isbase64 (in[1]))
+    return false;
+
+  if (*outleft)
+    {
+      *out++ = ((b64[to_uchar (in[0])] << 2)
+                | (b64[to_uchar (in[1])] >> 4));
+      --*outleft;
+    }
+
+  if (inlen == 2) /* incomplete quad */
+    return_false;
+
+  if (in[2] == '=') /* only "==" padding may follow here */
+    {
+      if (inlen != 4)
+        return_false;
+
+      if (in[3] != '=')
+        return_false;
+    }
+  else
+    {
+      if (!isbase64 (in[2]))
+        return_false;
+
+      if (*outleft)
+        {
+          *out++ = (((b64[to_uchar (in[1])] << 4) & 0xf0)
+                    | (b64[to_uchar (in[2])] >> 2));
+          --*outleft;
+        }
+
+      if (inlen == 3) /* incomplete quad */
+        return_false;
+
+      if (in[3] == '=') /* single '=' padding: no third output byte */
+        {
+          if (inlen != 4)
+            return_false;
+        }
+      else
+        {
+          if (!isbase64 (in[3]))
+            return_false;
+
+          if (*outleft)
+            {
+              *out++ = (((b64[to_uchar (in[2])] << 6) & 0xc0)
+                        | b64[to_uchar (in[3])]);
+              --*outleft;
+            }
+        }
+    }
+
+  *outp = out;
+  return true;
+}
+
+/* Decode base64-encoded input array IN of length INLEN to output array
+   OUT that can hold *OUTLEN bytes.  The input data may be interspersed
+   with newlines.  Return true if decoding was successful, i.e. if the
+   input was valid base64 data, false otherwise.  If *OUTLEN is too
+   small, as many bytes as possible will be written to OUT.  On return,
+   *OUTLEN holds the length of decoded bytes in OUT.  Note that as soon
+   as any non-alphabet, non-newline character is encountered, decoding
+   is stopped and false is returned.  If INLEN is zero, then process
+   only whatever data is stored in CTX.
+
+   Initially, CTX must have been initialized via base64_decode_ctx_init.
+   Subsequent calls to this function must reuse whatever state is recorded
+   in that buffer.  It is necessary for when a quadruple of base64 input
+   bytes spans two input buffers.
+
+   If CTX is NULL then newlines are treated as garbage and the input
+   buffer is processed as a unit.  */
+
+bool
+base64_decode_ctx (struct base64_decode_context *ctx,
+                   const char *restrict in, size_t inlen,
+                   char *restrict out, size_t *outlen)
+{
+  size_t outleft = *outlen;
+  bool ignore_newlines = ctx != NULL;
+  bool flush_ctx = false;
+  unsigned int ctx_i = 0; /* CTX->i as of entry; not refreshed in the loop */
+
+  if (ignore_newlines)
+    {
+      ctx_i = ctx->i;
+      flush_ctx = inlen == 0; /* empty input: decode only what CTX holds */
+    }
+
+
+  while (true)
+    {
+      size_t outleft_save = outleft;
+      if (ctx_i == 0 && !flush_ctx)
+        {
+          while (true) /* fast path: decode directly from IN, quad by quad */
+            {
+              /* Save a copy of outleft, in case we need to re-parse this
+                 block of four bytes.  */
+              outleft_save = outleft;
+              if (!decode_4 (in, inlen, &out, &outleft))
+                break;
+
+              in += 4;
+              inlen -= 4;
+            }
+        }
+
+      if (inlen == 0 && !flush_ctx)
+        break;
+
+      /* Handle the common case of 72-byte wrapped lines.
+         This also handles any other multiple-of-4-byte wrapping.  */
+      if (inlen && *in == '\n' && ignore_newlines)
+        {
+          ++in;
+          --inlen;
+          continue;
+        }
+
+      /* Restore OUT and OUTLEFT.  */
+      out -= outleft_save - outleft;
+      outleft = outleft_save;
+
+      {
+        char const *in_end = in + inlen;
+        char const *non_nl;
+
+        if (ignore_newlines)
+          non_nl = get_4 (ctx, &in, in_end, &inlen); /* INLEN := non-newline count */
+        else
+          non_nl = in;  /* Might have nl in this case. */
+
+        /* If the input is empty or consists solely of newlines (0 non-newlines),
+           then we're done.  Likewise if there are fewer than 4 bytes when not
+           flushing context and not treating newlines as garbage.  */
+        if (inlen == 0 || (inlen < 4 && !flush_ctx && ignore_newlines))
+          {
+            inlen = 0;
+            break;
+          }
+        if (!decode_4 (non_nl, inlen, &out, &outleft))
+          break;
+
+        inlen = in_end - in; /* input bytes not yet consumed */
+      }
+    }
+
+  *outlen -= outleft; /* capacity minus space left = bytes written */
+
+  return inlen == 0;
+}
+
+/* Allocate an output buffer in *OUT, and decode the base64 encoded
+   data stored in IN of size INLEN to the *OUT buffer.  On return, the
+   size of the decoded data is stored in *OUTLEN.  OUTLEN may be NULL,
+   if the caller is not interested in the decoded length.  *OUT may be
+   NULL to indicate an out of memory error, in which case *OUTLEN
+   contains the size of the memory block needed.  The function returns
+   true on successful decoding and memory allocation errors.  (Use the
+   *OUT and *OUTLEN parameters to differentiate between successful
+   decoding and memory error.)  The function returns false if the
+   input was invalid, in which case *OUT is NULL and *OUTLEN is
+   undefined. */
+bool
+base64_decode_alloc_ctx (struct base64_decode_context *ctx,
+                         const char *in, size_t inlen, char **out,
+                         size_t *outlen)
+{
+  /* This may allocate a few bytes too many, depending on input,
+     but it's not worth the extra CPU time to compute the exact size.
+     Up to 3 input bytes may already be pending in CTX, so IN can
+     complete one more quad (3 decoded bytes) than INLEN / 4 suggests;
+     hence + 3 rather than + 2, which silently dropped the last decoded
+     byte in that case.  Dividing before multiplying avoids the
+     possibility of overflow.  */
+  size_t needlen = 3 * (inlen / 4) + 3;
+
+  *out = (char *) malloc (needlen);
+  if (*out && !base64_decode_ctx (ctx, in, inlen, *out, &needlen))
+    {
+      free (*out);
+      *out = NULL;
+      return false;
+    }
+
+  /* Decoded length, or (if malloc failed) the size that was requested.  */
+  if (outlen)
+    *outlen = needlen;
+
+  return true;
+}
diff --git a/base64.h b/base64.h
new file mode 100644
index 0000000..8efa678
--- /dev/null
+++ b/base64.h
@@ -0,0 +1,65 @@
+/* Taken from GNU coreutils */
+
+#define restrict
+
+/* base64.h -- Encode binary data using printable characters.
+   Copyright (C) 2004-2006, 2009-2011 Free Software Foundation, Inc.
+   Written by Simon Josefsson.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
+
+#ifndef BASE64_H
+# define BASE64_H
+
+/* Get size_t. */
+# include <stddef.h>
+
+/* Get bool. */
+# include <stdbool.h>
+
+/* This uses that the expression (n+(k-1))/k means the smallest
+   integer >= n/k, i.e., the ceiling of n/k.  */
+# define BASE64_LENGTH(inlen) ((((inlen) + 2) / 3) * 4)
+
+struct base64_decode_context  /* state handed to base64_decode_ctx; set up by base64_decode_ctx_init */
+{
+  unsigned int i;  /* presumably the number of bytes currently held in buf -- confirm in base64.cpp */
+  char buf[4];     /* four bytes, the size of one base64 input group */
+};
+
+extern bool isbase64 (char ch);
+
+extern void base64_encode (const char *restrict in, size_t inlen,
+                           char *restrict out, size_t outlen);
+
+extern size_t base64_encode_alloc (const char *in, size_t inlen, char **out);
+
+extern void base64_decode_ctx_init (struct base64_decode_context *ctx);
+
+extern bool base64_decode_ctx (struct base64_decode_context *ctx,
+                               const char *restrict in, size_t inlen,
+                               char *restrict out, size_t *outlen);
+
+extern bool base64_decode_alloc_ctx (struct base64_decode_context *ctx,
+                                     const char *in, size_t inlen,
+                                     char **out, size_t *outlen);
+/* Decode without a context: base64_decode_ctx called with a NULL context.  */
+#define base64_decode(in, inlen, out, outlen) \
+        base64_decode_ctx (NULL, in, inlen, out, outlen)
+/* Allocating variant: base64_decode_alloc_ctx called with a NULL context.  */
+#define base64_decode_alloc(in, inlen, out, outlen) \
+        base64_decode_alloc_ctx (NULL, in, inlen, out, outlen)
+
+#endif /* BASE64_H */
diff --git a/crypto.cpp b/crypto.cpp
new file mode 100644
index 0000000..6431328
--- /dev/null
+++ b/crypto.cpp
@@ -0,0 +1,211 @@
+#include <string.h>
+#include <stdio.h>
+
+#include "crypto.hpp"
+#include "base64.h"
+
+using namespace std;
+
+const char rdev[] = "/dev/urandom";
+
+static void * sse_alloc( int len )
+{
+  /* Returns LEN bytes aligned to 16; throws std::bad_alloc on failure. */
+  void *aligned = NULL;
+  const int rc = posix_memalign( (void **)&aligned, 16, len );
+  if ( (rc != 0) || (aligned == NULL) ) {
+    throw std::bad_alloc();
+  }
+  return aligned;
+}
+
+Base64Key::Base64Key( string printable_key )
+{
+  /* A 16-octet key is 22 base64 characters once the "==" padding is dropped. */
+  if ( printable_key.length() != 22 ) {
+    throw CryptoException( "Key must be 22 letters long." );
+  }
+
+  const string padded = printable_key + "==";
+  size_t decoded_len = 16;
+  const bool well_formed = base64_decode( padded.data(), 24,
+					   (char *)&key[ 0 ], &decoded_len );
+  if ( !well_formed ) {
+    throw CryptoException( "Key must be well-formed base64." );
+  }
+  if ( decoded_len != 16 ) {
+    throw CryptoException( "Key must represent 16 octets." );
+  }
+  /* Re-encode and compare, to catch changes after the first 128 bits. */
+  if ( printable_key != this->printable_key() ) {
+    throw CryptoException( "Base64 key was not encoded 128-bit key." );
+  }
+}
+
+Base64Key::Base64Key() /* random key: 16 octets read from rdev */
+{
+  FILE *devrandom = fopen( rdev, "r" );
+  if ( devrandom == NULL ) {
+    throw CryptoException( string( rdev ) + ": " + strerror( errno ) );
+  }
+  /* Close before throwing so that a failed read cannot leak the stream. */
+  const bool got_key = ( 1 == fread( key, 16, 1, devrandom ) );
+  if ( (0 != fclose( devrandom )) && got_key ) {
+    throw CryptoException( string( rdev ) + ": " + strerror( errno ) );
+  }
+  if ( !got_key ) {
+    throw CryptoException( "Could not read from " + string( rdev ) );
+  }
+}
+
+string Base64Key::printable_key( void )
+{
+  /* Room for 24 base64 characters (the last two are padding) and a NUL. */
+  char encoded[ 25 ];
+  base64_encode( (char *)key, 16, encoded, 25 );
+
+  const bool padded_as_expected = (encoded[ 24 ] == 0)
+    && (encoded[ 23 ] == '=')
+    && (encoded[ 22 ] == '=');
+  if ( !padded_as_expected ) {
+    throw CryptoException( "Unexpected output from base64_encode." );
+  }
+  encoded[ 22 ] = 0; /* cut the string just before the "==" padding */
+  return string( encoded );
+}
+
+Session::Session( Base64Key s_key )
+  : key( s_key ), ctx( NULL )
+{ /* Allocate and initialize the AES-OCB context; throws CryptoException. */
+  ctx = ae_allocate( NULL );
+  if ( ctx == NULL ) {
+    throw CryptoException( "Could not allocate AES-OCB context." );
+  }
+  if ( AE_SUCCESS != ae_init( ctx, key.data(), 16, 12, 16 ) ) {
+    ae_free( ctx ); /* ~Session() does not run when the constructor throws */
+    throw CryptoException( "Could not initialize AES-OCB context." );
+  }
+}
+
+Session::~Session()
+{ /* Clear and release the AES-OCB context. */
+  /* Must not throw: a throwing destructor terminates during unwinding. */
+  if ( ae_clear( ctx ) != AE_SUCCESS ) {
+    fprintf( stderr, "Could not clear AES-OCB context.\n" );
+  }
+  ae_free( ctx );
+}
+
+Nonce::Nonce( uint64_t val )
+{
+  /* Layout: 4 zero octets followed by VAL in big-endian order. */
+  const uint64_t big_endian = htobe64( val );
+  memset( &bytes[ 0 ], 0, 4 );
+  memcpy( &bytes[ 4 ], &big_endian, sizeof( big_endian ) );
+}
+
+uint64_t Nonce::val( void )
+{ /* Decode the big-endian value held in the last 8 octets. */
+  uint64_t big_endian;
+  memcpy( &big_endian, &bytes[ 4 ], sizeof( big_endian ) );
+  return be64toh( big_endian );
+}
+
+Nonce::Nonce( char *s_bytes, size_t len )
+{
+  /* Takes the 8 trailing octets; the 4 leading octets are always zero. */
+  if ( len != 8 ) {
+    throw CryptoException( "Nonce representation must be 8 octets long." );
+  }
+  memset( &bytes[ 0 ], 0, 4 );
+  memcpy( &bytes[ 4 ], s_bytes, 8 );
+}
+
+Message::Message( char *nonce_bytes, size_t nonce_len,
+		  char *text_bytes, size_t text_len )
+  : nonce( nonce_bytes, nonce_len ), /* the Nonce constructor throws unless nonce_len is 8 */
+    text( (char *)text_bytes, text_len ) /* copies text_len bytes */
+{}
+
+Message::Message( Nonce s_nonce, string s_text )
+  : nonce( s_nonce ), /* both members are copies of the arguments */
+    text( s_text )
+{}
+
+string Session::encrypt( Message plaintext )
+{ /* Returns the 8-octet nonce, then ciphertext plus tag; throws CryptoException. */
+  const size_t pt_len = plaintext.text.size();
+  const int ciphertext_len = pt_len + 16;
+  /* The OCB code wants 16-byte-aligned pointers, which a by-value Message
+     does not guarantee, so hand it an aligned copy of the nonce. */
+  char __attribute__((__aligned__ (16))) nonce_bytes[ 12 ];
+  memcpy( nonce_bytes, plaintext.nonce.data(), 12 );
+  /* Check before allocating so that this throw cannot leak the buffers. */
+  if ( (uint64_t( &nonce_bytes[ 0 ] ) & 0xf) != 0 ) {
+    throw CryptoException( "Bad alignment." );
+  }
+
+  char *ciphertext = (char *)sse_alloc( ciphertext_len );
+  char *pt = (char *)sse_alloc( pt_len );
+  memcpy( pt, plaintext.text.data(), pt_len );
+  if ( ciphertext_len != ae_encrypt( ctx,                                     /* ctx */
+				     nonce_bytes,                             /* nonce */
+				     pt,                                      /* pt */
+				     pt_len,                                  /* pt_len */
+				     NULL,                                    /* ad */
+				     0,                                       /* ad_len */
+				     ciphertext,                              /* ct */
+				     NULL,                                    /* tag */
+				     AE_FINALIZE ) ) {                        /* final */
+    free( pt );
+    free( ciphertext );
+    throw CryptoException( "ae_encrypt() returned error." );
+  }
+  string text( (char *)ciphertext, ciphertext_len );
+  free( pt );
+  free( ciphertext );
+  return plaintext.nonce.cpp_str() + text;
+}
+
+Message Session::decrypt( string ciphertext )
+{ /* Input is an 8-octet nonce followed by the body, which ends in a 16-octet tag. */
+  if ( ciphertext.size() < 24 ) {
+    throw CryptoException( "Ciphertext must contain nonce and tag." );
+  }
+
+  char *str = (char *)ciphertext.data();
+  int body_len = ciphertext.size() - 8;
+  int pt_len = body_len - 16;
+
+  /* Super-assertion that pt_len cannot equal AE_INVALID (presumably negative;
+     see ae.hpp).  Zero is legal: encrypt() of empty text yields 24 octets. */
+  if ( pt_len < 0 ) {
+    fprintf( stderr, "BUG.\n" );
+    exit( 1 );
+  }
+
+  Nonce __attribute__((__aligned__ (16))) nonce( str, 8 );
+  char *body = (char *)sse_alloc( body_len );
+  memcpy( body, str + 8, body_len );
+
+  char *plaintext = (char *)sse_alloc( pt_len );
+
+  if ( pt_len != ae_decrypt( ctx,               /* ctx */
+			     nonce.data(),      /* nonce */
+			     body,              /* ct */
+			     body_len,          /* ct_len */
+			     NULL,              /* ad */
+			     0,                 /* ad_len */
+			     plaintext,         /* pt */
+			     NULL,              /* tag */
+			     AE_FINALIZE ) ) {  /* final */
+    free( plaintext );
+    free( body );
+    throw CryptoException( "ae_decrypt() returned error." );
+  }
+
+  Message ret( nonce, string( plaintext, pt_len ) );
+  free( plaintext );
+  free( body );
+  return ret;
+}
diff --git a/crypto.hpp b/crypto.hpp
new file mode 100644
index 0000000..e3be49f
--- /dev/null
+++ b/crypto.hpp
@@ -0,0 +1,65 @@
+#ifndef CRYPTO_HPP
+#define CRYPTO_HPP
+
+#include "ae.hpp"
+#include <string>
+
+using namespace std;
+
+class CryptoException { /* thrown by the crypto classes; carries a message */
+public:
+  string text; /* human-readable description of the failure */
+  CryptoException( string s_text ) : text( s_text ) {};
+};
+
+class Base64Key { /* a 16-octet key with a 22-character base64 printable form */
+private:
+  unsigned char key[ 16 ];
+
+public:
+  Base64Key(); /* random key */
+  Base64Key( string printable_key ); /* throws CryptoException if malformed */
+  string printable_key( void ); /* base64 form without the "==" padding */
+  unsigned char *data( void ) { return key; } /* the raw 16 octets */
+};
+
+class Nonce { /* 12 octets: 4 zero octets, then a 64-bit big-endian value */
+private:
+  char bytes[ 12 ];
+
+public:
+  Nonce( uint64_t val );
+  Nonce( char *s_bytes, size_t len ); /* len must be 8, else CryptoException */
+
+  string cpp_str( void ) { return string( (char *)( bytes + 4 ), 8 ); } /* last 8 octets */
+  char *data( void ) { return bytes; } /* all 12 octets */
+  uint64_t val( void ); /* the 64-bit value in host byte order */
+};
+
+class Message { /* a nonce together with the text it belongs to */
+public:
+  Nonce nonce;
+  string text;
+
+  Message( char *nonce_bytes, size_t nonce_len,
+	   char *text_bytes, size_t text_len ); /* from raw bytes; nonce_len must be 8 */
+  Message( Nonce s_nonce, string s_text );
+};
+
+class Session { /* an AES-OCB context together with the key it was set up with */
+private:
+  Base64Key key;
+  ae_ctx *ctx;
+
+  /* Not copyable, since ctx is owned: declared private, never defined. */
+  Session( const Session & );
+  Session & operator=( const Session & );
+
+public:
+  Session( Base64Key s_key );
+  ~Session();
+  string encrypt( Message plaintext );
+  Message decrypt( string ciphertext );
+};
+
+#endif
diff --git a/decrypt.cpp b/decrypt.cpp
new file mode 100644
index 0000000..5f2d13b
--- /dev/null
+++ b/decrypt.cpp
@@ -0,0 +1,58 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <assert.h>
+#include <string.h>
+#include <unistd.h>
+#include <iostream>
+
+#include "crypto.hpp"
+
+int main( int argc, char *argv[] )
+{
+  /* Usage: decrypt KEY, with the ciphertext on standard input.  Exits
+     with status 1 on usage, read and crypto errors. */
+  if ( argc != 2 ) {
+    fprintf( stderr, "Usage: %s KEY\n", argv[ 0 ] );
+    return 1;
+  }
+
+  try {
+    Base64Key key( argv[ 1 ] );
+    Session session( key );
+
+    /* Read all of standard input.  Appending to a string avoids the
+       unchecked realloc() (assert vanishes under NDEBUG) and never builds
+       a string from a NULL pointer when the input is empty. */
+    string ciphertext;
+
+    while ( 1 ) {
+      char buf[ 16384 ];
+      ssize_t bytes_read = read( STDIN_FILENO, buf, sizeof( buf ) );
+      if ( bytes_read == 0 ) { /* EOF */
+	break;
+      } else if ( bytes_read < 0 ) {
+	perror( "read" );
+	exit( 1 );
+      }
+
+      ciphertext.append( buf, bytes_read );
+    }
+
+    /* Decrypt message */
+
+    Message message = session.decrypt( ciphertext );
+
+    /* The nonce is a uint64_t, which "%ld" misprints when long is 32 bits
+       wide or the value exceeds LONG_MAX. */
+    fprintf( stderr, "Nonce = %llu\n",
+	     (unsigned long long) message.nonce.val() );
+
+    cout << message.text;
+  } catch ( const CryptoException &e ) {
+    cerr << e.text << endl;
+    exit( 1 );
+  }
+
+  return 0;
+}
diff --git a/encrypt.cpp b/encrypt.cpp
new file mode 100644
index 0000000..b6d6d95
--- /dev/null
+++ b/encrypt.cpp
@@ -0,0 +1,74 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <assert.h>
+#include <string.h>
+#include <unistd.h>
+#include <iostream>
+
+#include "crypto.hpp"
+
+long int myatoi( char *str )
+{ /* Parse STR as a complete base-10 integer; throws CryptoException otherwise. */
+  char *end;
+  errno = 0;
+  long int ret = strtol( str, &end, 10 );
+
+  /* Reject range errors, input with no digits (e.g. ""), and trailing junk. */
+  if ( ( errno != 0 )
+       || ( end == str )
+       || ( *end != '\0' ) ) {
+    throw CryptoException( "Bad integer." );
+  }
+  return ret;
+}
+
+int main( int argc, char *argv[] )
+{
+  /* Usage: encrypt NONCE, with the plaintext on standard input.  Exits
+     with status 1 on usage, read and crypto errors. */
+
+  if ( argc != 2 ) {
+    fprintf( stderr, "Usage: %s NONCE\n", argv[ 0 ] );
+    return 1;
+  }
+
+  try {
+    Base64Key key;
+    Session session( key );
+    Nonce nonce( myatoi( argv[ 1 ] ) );
+
+    /* Read all of standard input.  Appending to a string avoids the
+       unchecked realloc() (assert vanishes under NDEBUG) and never builds
+       a string from a NULL pointer when the input is empty. */
+    string plaintext;
+
+    while ( 1 ) {
+      char buf[ 16384 ];
+      ssize_t bytes_read = read( STDIN_FILENO, buf, sizeof( buf ) );
+      if ( bytes_read == 0 ) { /* EOF */
+	break;
+      } else if ( bytes_read < 0 ) {
+	perror( "read" );
+	exit( 1 );
+      }
+
+      plaintext.append( buf, bytes_read );
+    }
+
+    /* Encrypt message */
+
+    string ciphertext = session.encrypt( Message( nonce, plaintext ) );
+
+    /* The key was generated above, so print it for the decrypting side. */
+    cerr << "Key: " << key.printable_key() << endl;
+
+    /* The output is the raw nonce followed by the ciphertext and tag. */
+    cout << ciphertext;
+  } catch ( const CryptoException &e ) {
+    cerr << e.text << endl;
+    exit( 1 );
+  }
+
+  return 0;
+}
diff --git a/grant.htm b/grant.htm
new file mode 100644
index 0000000..707d968
--- /dev/null
+++ b/grant.htm
@@ -0,0 +1,38 @@
+<TITLE>OCB - An Authenticated-Encryption Scheme - GPL Patent Grant - Rogaway</TITLE>
+
+<body bgcolor="#FFFFFF">
+<H2><a name="ocb-grant"> <font face="Arial, Helvetica, sans-serif" size="6" color="#FF0000">OCB: 
+  Patent Grant for GNU GPL</font> </a> </H2>
+
+Whereas I, Phillip Rogaway (hereinafter "Inventor") have sought 
+patent protection for certain technology 
+(hereinafter "Patented Technology"), 
+and Inventor wishes to aid the Free Software Foundation in achieving its goals, 
+and Inventor wishes to increase public awareness of Patented Technology, 
+Inventor hereby grants a fully paid-up, nonexclusive, 
+royalty-free license to 
+practice any patents claiming priority to the 
+patent applications below ("the Patents") 
+if practiced by
+software distributed 
+under the terms of any version of 
+the GNU General Public License as published by the Free Software Foundation, 
+59 Temple Place, Suite 330, Boston, MA 02111. 
+Inventor reserves all other rights, including without limitation
+licensing for software not distributed under the GNU General Public License. 
+
+<h4>The patents:</h4>
+
+
+<ul>
+<li> <a href="http://appft1.uspto.gov/netacgi/nph-Parser?Sect1=PTO2&Sect2=HITOFF&p=1&u=%2Fnetahtml%2FPTO%2Fsearch-bool.html&r=2&f=G&l=50&co1=AND&d=PG01&s1=rogaway.IN.&OS=IN/rogaway&RS=IN/rogaway">
+09/918,615</a>  -
+Method and Apparatus for Facilitating Efficient Authenticated Encryption.
+
+<li> <a href="http://appft1.uspto.gov/netacgi/nph-Parser?Sect1=PTO2&Sect2=HITOFF&p=1&u=%2Fnetahtml%2FPTO%2Fsearch-bool.html&r=3&f=G&l=50&co1=AND&d=PG01&s1=rogaway.IN.&OS=IN/rogaway&RS=IN/rogaway">
+09/948,084</a> - 
+Method and Apparatus for Realizing a Parallelizable Variable-Input-Length 
+Pseudorandom Function. 
+</ul>
+
+
diff --git a/ocb.cpp b/ocb.cpp
new file mode 100644
index 0000000..3ac86cf
--- /dev/null
+++ b/ocb.cpp
@@ -0,0 +1,1226 @@
+/*------------------------------------------------------------------------
+/ OCB Version 3 Reference Code (Optimized C)     Last modified 13-JUL-2011
+/-------------------------------------------------------------------------
+/ Copyright (c) 2011 Ted Krovetz.
+/
+/ Permission to use, copy, modify, and/or distribute this software for any
+/ purpose with or without fee is hereby granted, provided that the above
+/ copyright notice and this permission notice appear in all copies.
+/
+/ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+/ WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+/ MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+/ ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+/ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+/ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+/ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+/
+/ Phillip Rogaway holds patents relevant to OCB. See the following for
+/ his patent grant: http://www.cs.ucdavis.edu/~rogaway/ocb/grant.htm
+/
+/ Comments are welcome: Ted Krovetz <ted@krovetz.net> - Dedicated to Laurel K
+/------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------- */
+/* Usage notes                                                             */
+/* ----------------------------------------------------------------------- */
+
+/* - When AE_PENDING is passed as the 'final' parameter of any function,
+/    the length parameters must be a multiple of (BPI*16).
+/  - When available, SSE or AltiVec registers are used to manipulate data.
+/    So, when on machines with these facilities, all pointers passed to
+/    any function should be 16-byte aligned.
+/  - Plaintext and ciphertext pointers may be equal (ie, plaintext gets
+/    encrypted in-place), but no other pair of pointers may be equal.      
+/  - This code assumes all x86 processors have SSE2 and SSSE3 instructions
+/    when compiling under MSVC. If untrue, alter the #define.
+/  - This code is tested for C99 and recent versions of GCC and MSVC.      */
+
+/* ----------------------------------------------------------------------- */
+/* User configuration options                                              */
+/* ----------------------------------------------------------------------- */
+
+/* Set the AES key length to use and length of authentication tag to produce.
+/  Setting either to 0 requires the value be set at runtime via ae_init().
+/  Some optimizations occur for each when set to a fixed value.            */
+#define OCB_KEY_LEN         16  /* 0, 16, 24 or 32. 0 means set in ae_init */
+#define OCB_TAG_LEN         16  /* 0 to 16. 0 means set in ae_init         */
+
+/* This implementation has built-in support for multiple AES APIs. Set any
+/  one of the following to non-zero to specify which to use.               */
+#define USE_OPENSSL_AES      1  /* http://openssl.org                      */
+#define USE_REFERENCE_AES    0  /* Internet search: rijndael-alg-fst.c     */
+#define USE_AES_NI           0  /* Uses compiler's intrinsics              */
+
+/* During encryption and decryption, various "L values" are required.
+/  The L values can be precomputed during initialization (requiring extra
+/  space in ae_ctx), generated as needed (slightly slowing encryption and
+/  decryption), or some combination of the two. L_TABLE_SZ specifies how many
+/  L values to precompute. L_TABLE_SZ must be at least 3. L_TABLE_SZ*16 bytes
+/  are used for L values in ae_ctx. Plaintext and ciphertexts shorter than
+/  2^L_TABLE_SZ blocks need no L values calculated dynamically.            */
+#define L_TABLE_SZ          16
+
+/* Set L_TABLE_SZ_IS_ENOUGH non-zero iff you know that all plaintexts
+/  will be shorter than 2^(L_TABLE_SZ+4) bytes in length. This results
+/  in better performance.                                                  */
+#define L_TABLE_SZ_IS_ENOUGH 1
+
+/* ----------------------------------------------------------------------- */
+/* Includes and compiler specific definitions                              */
+/* ----------------------------------------------------------------------- */
+
+#include "ae.hpp"
+#include <stdlib.h>
+#include <string.h>
+
+/* Define standard sized integers                                          */
+#if defined(_MSC_VER) && (_MSC_VER < 1600)
+	typedef unsigned __int8  uint8_t;
+	typedef unsigned __int32 uint32_t;
+	typedef unsigned __int64 uint64_t;
+	typedef          __int64 int64_t;
+#else
+	#include <stdint.h>
+#endif
+
+/* Compiler-specific intrinsics and fixes: bswap64, ntz                    */
+#if _MSC_VER
+	#define inline __inline        /* MSVC doesn't recognize "inline" in C */
+	#define restrict __restrict  /* MSVC doesn't recognize "restrict" in C */
+    #define __SSE2__   (_M_IX86 || _M_AMD64 || _M_X64)    /* Assume SSE2  */
+    #define __SSSE3__  (_M_IX86 || _M_AMD64 || _M_X64)    /* Assume SSSE3 */
+	#include <intrin.h>
+	#pragma intrinsic(_byteswap_uint64, _BitScanForward, memcpy)
+	#define bswap64(x) _byteswap_uint64(x)
+	static inline unsigned ntz(unsigned x) {_BitScanForward(&x,x);return x;}
+#elif __GNUC__
+	#define inline __inline__            /* No "inline" in GCC ansi C mode */
+	#define restrict __restrict__      /* No "restrict" in GCC ansi C mode */
+	#define bswap64(x) __builtin_bswap64(x)           /* Assuming GCC 4.3+ */
+	#define ntz(x)     __builtin_ctz((unsigned)(x))   /* Assuming GCC 3.4+ */
+#else              /* Assume some C99 features: stdint.h, inline, restrict */
+	#define bswap32(x)                                              \
+	   ((((x) & 0xff000000u) >> 24) | (((x) & 0x00ff0000u) >>  8) | \
+		(((x) & 0x0000ff00u) <<  8) | (((x) & 0x000000ffu) << 24))
+
+	 static inline uint64_t bswap64(uint64_t x) { /* byte-reverse a 64-bit value */
+		union { uint64_t u64; uint32_t u32[2]; } in, out;
+		in.u64 = x;
+		out.u32[0] = bswap32(in.u32[1]);  /* the 32-bit halves trade places ... */
+		out.u32[1] = bswap32(in.u32[0]);  /* ... and each one is byte-reversed */
+		return out.u64;
+	}
+    
+	#if (L_TABLE_SZ <= 9) && (L_TABLE_SZ_IS_ENOUGH)   /* < 2^13 byte texts */
+	static inline unsigned ntz(unsigned x) { /* table lookup indexed by x/4 */
+		static const unsigned char tz_table[] = {0, /* entry k appears to be the trailing-zero count of 4*k */
+		2,3,2,4,2,3,2,5,2,3,2,4,2,3,2,6,2,3,2,4,2,3,2,5,2,3,2,4,2,3,2,7,
+		2,3,2,4,2,3,2,5,2,3,2,4,2,3,2,6,2,3,2,4,2,3,2,5,2,3,2,4,2,3,2,8,
+		2,3,2,4,2,3,2,5,2,3,2,4,2,3,2,6,2,3,2,4,2,3,2,5,2,3,2,4,2,3,2,7,
+		2,3,2,4,2,3,2,5,2,3,2,4,2,3,2,6,2,3,2,4,2,3,2,5,2,3,2,4,2,3,2};
+		return tz_table[x/4]; /* no bounds check: x/4 must stay inside the table */
+	}
+	#else       /* From http://supertech.csail.mit.edu/papers/debruijn.pdf */
+	static inline unsigned ntz(unsigned x) { /* multiply-and-lookup; see the paper cited above */
+		static const unsigned char tz_table[32] = 
+		{ 0,  1, 28,  2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17,  4, 8, 
+		 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18,  6, 11,  5, 10, 9};
+		return tz_table[((uint32_t)((x & -x) * 0x077CB531u)) >> 27]; /* x & -x keeps only the lowest set bit; the top 5 bits of the product select the entry */
+	}
+	#endif
+#endif
+
+/* ----------------------------------------------------------------------- */
+/* Define blocks and operations -- Patch if incorrect on your compiler.    */
+/* ----------------------------------------------------------------------- */
+
+#if __SSE2__
+    #include <xmmintrin.h>              /* SSE instructions and _mm_malloc */
+    #include <emmintrin.h>              /* SSE2 instructions               */
+    typedef __m128i block;
+    #define xor_block(x,y)        _mm_xor_si128(x,y)
+    #define zero_block()          _mm_setzero_si128()
+    #define unequal_blocks(x,y) \
+    					   (_mm_movemask_epi8(_mm_cmpeq_epi8(x,y)) != 0xffff)
+	#if __SSSE3__ || USE_AES_NI
+    #include <tmmintrin.h>              /* SSSE3 instructions              */
+    #define swap_if_le(b) \
+      _mm_shuffle_epi8(b,_mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15))
+	#else
+    static inline block swap_if_le(block b) { /* reverse all 16 bytes using SSE2 only */
+		block a = _mm_shuffle_epi32  (b, _MM_SHUFFLE(0,1,2,3));  /* reverse the four 32-bit lanes */
+		a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(2,3,0,1));  /* swap the 16-bit halves of the two upper lanes */
+		a = _mm_shufflelo_epi16(a, _MM_SHUFFLE(2,3,0,1));  /* same for the two lower lanes */
+		return _mm_xor_si128(_mm_srli_epi16(a,8), _mm_slli_epi16(a,8));  /* swap the bytes of every 16-bit word */
+    }
+	#endif
+	static inline block gen_offset(uint64_t KtopStr[3], unsigned bot) { /* 128 bits of KtopStr starting bot bits in, each 64-bit half byte-reversed */
+		block hi = _mm_load_si128((__m128i *)(KtopStr+0));   /* hi = B A */ /* aligned load: KtopStr must be 16-byte aligned */
+		block lo = _mm_loadu_si128((__m128i *)(KtopStr+1));  /* lo = C B */
+		__m128i lshift = _mm_cvtsi32_si128(bot);
+		__m128i rshift = _mm_cvtsi32_si128(64-bot);
+		lo = _mm_xor_si128(_mm_sll_epi64(hi,lshift),_mm_srl_epi64(lo,rshift)); /* per lane: (hi << bot) ^ (lo >> (64-bot)) */
+		#if __SSSE3__ || USE_AES_NI
+		return _mm_shuffle_epi8(lo,_mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7)); /* reverse the bytes inside each 64-bit lane */
+		#else
+		return swap_if_le(_mm_shuffle_epi32(lo, _MM_SHUFFLE(1,0,3,2))); /* swap the 64-bit lanes, then reverse all 16 bytes */
+		#endif
+	}
+	static inline block double_block(block bl) { /* 128-bit shift left by one; 135 is XORed into the lowest lane when the top bit was set */
+		const __m128i mask = _mm_set_epi32(135,1,1,1);
+		__m128i tmp = _mm_srai_epi32(bl, 31);  /* each lane becomes all ones if its top bit was set */
+		tmp = _mm_and_si128(tmp, mask);  /* keep a carry of 1 (135 for the top lane) */
+		tmp = _mm_shuffle_epi32(tmp, _MM_SHUFFLE(2,1,0,3));  /* move each carry up one lane; the top lane wraps to the bottom */
+		bl = _mm_slli_epi32(bl, 1);
+		return _mm_xor_si128(bl,tmp);
+	}
+#elif __ALTIVEC__
+    #include <altivec.h>
+    typedef vector unsigned block;
+    #define xor_block(x,y)         vec_xor(x,y)
+    #define zero_block()           vec_splat_u32(0)
+    #define unequal_blocks(x,y)    vec_any_ne(x,y)
+    #define swap_if_le(b)          (b)
+	#if __PPC64__
+	block gen_offset(uint64_t KtopStr[3], unsigned bot) { /* 128 bits of KtopStr starting bot bits in */
+		union {uint64_t u64[2]; block bl;} rval; /* bot == 0 is special-cased: a 64-bit shift by 64 is undefined */
+		rval.u64[0] = bot ? (KtopStr[0] << bot) | (KtopStr[1] >> (64-bot)) : KtopStr[0];
+		rval.u64[1] = bot ? (KtopStr[1] << bot) | (KtopStr[2] >> (64-bot)) : KtopStr[1];
+        return rval.bl;
+	}
+	#else
+	/* Special handling: Shifts are mod 32, and no 64-bit types */
+	block gen_offset(uint64_t KtopStr[3], unsigned bot) {
+		const vector unsigned k32 = {32,32,32,32};
+		vector unsigned hi = *(vector unsigned *)(KtopStr+0);
+		vector unsigned lo = *(vector unsigned *)(KtopStr+2);
+		vector unsigned bot_vec;
+		if (bot < 32) {
+			lo = vec_sld(hi,lo,4);
+		} else {
+			vector unsigned t = vec_sld(hi,lo,4);
+			lo = vec_sld(hi,lo,8);
+			hi = t;
+			bot = bot - 32;
+		}
+		if (bot == 0) return hi;
+		*(unsigned *)&bot_vec = bot;
+		vector unsigned lshift = vec_splat(bot_vec,0);
+		vector unsigned rshift = vec_sub(k32,lshift);
+		hi = vec_sl(hi,lshift);
+		lo = vec_sr(lo,rshift);
+		return vec_xor(hi,lo);
+	}
+	#endif
+	static inline block double_block(block b) {
+		const vector unsigned char mask = {135,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
+		const vector unsigned char perm = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0};
+		const vector unsigned char shift7  = vec_splat_u8(7);
+		const vector unsigned char shift1  = vec_splat_u8(1);
+		vector unsigned char c = (vector unsigned char)b;
+		vector unsigned char t = vec_sra(c,shift7);
+		t = vec_and(t,mask);
+		t = vec_perm(t,t,perm);
+		c = vec_sl(c,shift1);
+		return (block)vec_xor(c,t);
+	}
+#elif __ARM_NEON__
+    #include <arm_neon.h>
+    typedef int8x16_t block;      /* Yay! Endian-neutral reads! */
+    #define xor_block(x,y)             veorq_s8(x,y)
+    #define zero_block()               vdupq_n_s8(0)
+    static inline int unequal_blocks(block a, block b) { /* nonzero iff a and b differ in any bit */
+		int64x2_t t=veorq_s64((int64x2_t)a,(int64x2_t)b); /* XOR is zero exactly where the inputs agree */
+		return (vgetq_lane_s64(t,0)|vgetq_lane_s64(t,1))!=0;
+    }
+    #define swap_if_le(b)          (b)  /* Using endian-neutral int8x16_t */
+	/* KtopStr is reg correct by 64 bits, return mem correct */
+	block gen_offset(uint64_t KtopStr[3], unsigned bot) {
+		const union { unsigned x; unsigned char endian; } little = { 1 };  /* endian reads 1 on a little-endian host */
+		const int64x2_t k64 = {-64,-64};
+		uint64x2_t hi = *(uint64x2_t *)(KtopStr+0);   /* hi = A B */
+		uint64x2_t lo = *(uint64x2_t *)(KtopStr+1);   /* lo = B C */
+		int64x2_t ls = vdupq_n_s64(bot);
+		int64x2_t rs = vqaddq_s64(k64,ls);            /* bot-64; presumably a negative count makes vshlq shift right -- confirm */
+		block rval = (block)veorq_u64(vshlq_u64(hi,ls),vshlq_u64(lo,rs));
+		if (little.endian)
+			rval = vrev64q_s8(rval);
+		return rval;
+	}
+	static inline block double_block(block b) /* shift left by one across all bytes; 135 is XORed into the last byte when the first byte's top bit was set */
+	{
+		const block mask = {135,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
+		block tmp = vshrq_n_s8(b,7);  /* arithmetic shift: a byte becomes all ones if its top bit was set */
+		tmp = vandq_s8(tmp, mask);
+		tmp = vextq_s8(tmp, tmp, 1);  /* Rotate high byte to end */
+		b = vshlq_n_s8(b,1);
+		return veorq_s8(tmp,b);
+	}
+#else
+    typedef struct { uint64_t l,r; } block;
+    static inline block xor_block(block x, block y) { /* bitwise XOR of both 64-bit halves */
+    	x.l^=y.l; x.r^=y.r; return x;
+    }
+    static inline block zero_block(void) { const block t = {0,0}; return t; }
+    #define unequal_blocks(x, y)         ((((x).l^(y).l)|((x).r^(y).r)) != 0)
+    static inline block swap_if_le(block b) {
+		/* Byte-reverse each 64-bit half on little-endian hosts; no-op otherwise. */
+		const union { unsigned x; unsigned char endian; } probe = { 1 };
+		block swapped = b;
+		if (!probe.endian)
+			return b;
+		swapped.l = bswap64(b.l);
+		swapped.r = bswap64(b.r);
+		return swapped;
+    }
+	
+	/* KtopStr is reg correct by 64 bits, return mem correct */
+	block gen_offset(uint64_t KtopStr[3], unsigned bot) { /* 128 bits of KtopStr starting bot bits in */
+        block rval;
+        if (bot != 0) { /* guarded because the shift by 64-bot would be a shift by 64 when bot is 0 */
+			rval.l = (KtopStr[0] << bot) | (KtopStr[1] >> (64-bot));
+			rval.r = (KtopStr[1] << bot) | (KtopStr[2] >> (64-bot));
+		} else {
+			rval.l = KtopStr[0];
+			rval.r = KtopStr[1];
+		}
+        return swap_if_le(rval); /* convert from register order to memory order */
+	}
+
+	#if __GNUC__ && __arm__
+	static inline block double_block(block b) { /* 128-bit shift left by one via an add-with-carry chain */
+		__asm__ ("adds %1,%1,%1\n\t"      /* low word of b.r doubled, sets carry */
+				 "adcs %H1,%H1,%H1\n\t"   /* high word of b.r, plus carry */
+				 "adcs %0,%0,%0\n\t"      /* low word of b.l, plus carry */
+				 "adcs %H0,%H0,%H0\n\t"   /* high word of b.l, plus carry */
+				 "eorcs %1,%1,#135"       /* if a bit was carried out of the top, XOR 135 into the lowest word */
+		: "+r"(b.l), "+r"(b.r) : : "cc");
+		return b;
+	}
+	#else
+	static inline block double_block(block b) { /* 128-bit shift left by one; 135 is XORed in when the top bit was set */
+		uint64_t t = (uint64_t)((int64_t)b.l >> 63);  /* all ones if the top bit of b.l is set (relies on arithmetic right shift) */
+		b.l = (b.l + b.l) ^ (b.r >> 63);  /* double b.l and bring in the top bit of b.r */
+		b.r = (b.r + b.r) ^ (t & 135);
+		return b;
+	}
+	#endif
+    
+#endif
+
+/* ----------------------------------------------------------------------- */
+/* AES - Code uses OpenSSL API. Other implementations get mapped to it.    */
+/* ----------------------------------------------------------------------- */
+
+/*---------------*/
+#if USE_OPENSSL_AES
+/*---------------*/
+
+#include <openssl/aes.h>                            /* http://openssl.org/ */
+
+/* How to ECB encrypt an array of blocks, in place                         */
+static inline void AES_ecb_encrypt_blks(block *blks, unsigned nblks, AES_KEY *key) {
+	/* Encrypts nblks blocks in place, working from the last block to the first. */
+	block *cur;
+	for (cur = blks + nblks; cur != blks; --cur)
+		AES_encrypt((unsigned char *)(cur - 1), (unsigned char *)(cur - 1), key);
+}
+
+static inline void AES_ecb_decrypt_blks(block *blks, unsigned nblks, AES_KEY *key) {
+	/* Decrypts nblks blocks in place, working from the last block to the first. */
+	block *cur;
+	for (cur = blks + nblks; cur != blks; --cur)
+		AES_decrypt((unsigned char *)(cur - 1), (unsigned char *)(cur - 1), key);
+}
+
+#define BPI 4  /* Number of blocks in buffer per ECB call */
+
+/*-------------------*/
+#elif USE_REFERENCE_AES
+/*-------------------*/
+
+#include "rijndael-alg-fst.h"              /* Barreto's Public-Domain Code */
+#if (OCB_KEY_LEN == 0)
+	typedef struct { uint32_t rd_key[60]; int rounds; } AES_KEY;
+	#define ROUNDS(ctx) ((ctx)->rounds)
+	#define AES_set_encrypt_key(x, y, z) \
+	 do {rijndaelKeySetupEnc((z)->rd_key, x, y); (z)->rounds = y/32+6;} while (0)
+	#define AES_set_decrypt_key(x, y, z) \
+	 do {rijndaelKeySetupDec((z)->rd_key, x, y); (z)->rounds = y/32+6;} while (0)
+#else
+	typedef struct { uint32_t rd_key[OCB_KEY_LEN+28]; } AES_KEY;
+	#define ROUNDS(ctx) (6+OCB_KEY_LEN/4)
+	#define AES_set_encrypt_key(x, y, z) rijndaelKeySetupEnc((z)->rd_key, x, y)
+	#define AES_set_decrypt_key(x, y, z) rijndaelKeySetupDec((z)->rd_key, x, y)
+#endif
+#define AES_encrypt(x,y,z) rijndaelEncrypt((z)->rd_key, ROUNDS(z), x, y)
+#define AES_decrypt(x,y,z) rijndaelDecrypt((z)->rd_key, ROUNDS(z), x, y)
+
+static void AES_ecb_encrypt_blks(block *blks, unsigned nblks, AES_KEY *key) {
+	/* Encrypts nblks blocks in place, working from the last block to the first. */
+	block *cur;
+	for (cur = blks + nblks; cur != blks; --cur)
+		AES_encrypt((unsigned char *)(cur - 1), (unsigned char *)(cur - 1), key);
+}
+
+/* Decrypts nblks blocks in place, last block first.  File-local, like the encrypt helper. */
+static void AES_ecb_decrypt_blks(block *blks, unsigned nblks, AES_KEY *key) {
+	block *cur;
+	for (cur = blks + nblks; cur != blks; --cur)
+		AES_decrypt((unsigned char *)(cur - 1), (unsigned char *)(cur - 1), key);
+}
+
+#define BPI 4  /* Number of blocks in buffer per ECB call */
+
+/*----------*/
+#elif USE_AES_NI
+/*----------*/
+
+#include <wmmintrin.h>
+
+#if (OCB_KEY_LEN == 0)
+	typedef struct { __m128i rd_key[15]; int rounds; } AES_KEY;
+	#define ROUNDS(ctx) ((ctx)->rounds)
+#else
+	typedef struct { __m128i rd_key[7+OCB_KEY_LEN/4]; } AES_KEY;
+	#define ROUNDS(ctx) (6+OCB_KEY_LEN/4)
+#endif
+
+#define EXPAND_ASSIST(v1,v2,v3,v4,shuff_const,aes_const)                    \
+    v2 = _mm_aeskeygenassist_si128(v4,aes_const);                           \
+    v3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(v3),              \
+                                         _mm_castsi128_ps(v1), 16));        \
+    v1 = _mm_xor_si128(v1,v3);                                              \
+    v3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(v3),              \
+                                         _mm_castsi128_ps(v1), 140));       \
+    v1 = _mm_xor_si128(v1,v3);                                              \
+    v2 = _mm_shuffle_epi32(v2,shuff_const);                                 \
+    v1 = _mm_xor_si128(v1,v2)
+/* Writes three round keys, kp[idx]..kp[idx+2], of the 192-bit schedule. Uses the caller's locals x0..x3, tmp and kp by name; aes_const and aes_const*2 are the two round constants consumed. */
+#define EXPAND192_STEP(idx,aes_const)                                       \
+    EXPAND_ASSIST(x0,x1,x2,x3,85,aes_const);                                \
+    x3 = _mm_xor_si128(x3,_mm_slli_si128 (x3, 4));                          \
+    x3 = _mm_xor_si128(x3,_mm_shuffle_epi32(x0, 255));                      \
+    kp[idx] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp),        \
+                                              _mm_castsi128_ps(x0), 68));   \
+    kp[idx+1] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0),       \
+                                                _mm_castsi128_ps(x3), 78)); \
+    EXPAND_ASSIST(x0,x1,x2,x3,85,(aes_const*2));                            \
+    x3 = _mm_xor_si128(x3,_mm_slli_si128 (x3, 4));                          \
+    x3 = _mm_xor_si128(x3,_mm_shuffle_epi32(x0, 255));                      \
+    kp[idx+2] = x0; tmp = x3
+
+void AES_128_Key_Expansion(const unsigned char *userkey, void *key)
+{   /* Fills key[0..10] with the 11 round keys derived from the 16-byte userkey. */
+    __m128i *sched = (__m128i *)key;
+    __m128i rk, assist, carry = _mm_setzero_si128();
+    sched[0] = rk = _mm_loadu_si128((__m128i*)userkey);
+    /* Each step derives the next round key from the previous one; the last argument is the round constant. */
+    EXPAND_ASSIST(rk,assist,carry,rk,255,0x01);  sched[1]  = rk;
+    EXPAND_ASSIST(rk,assist,carry,rk,255,0x02);  sched[2]  = rk;
+    EXPAND_ASSIST(rk,assist,carry,rk,255,0x04);  sched[3]  = rk;
+    EXPAND_ASSIST(rk,assist,carry,rk,255,0x08);  sched[4]  = rk;
+    EXPAND_ASSIST(rk,assist,carry,rk,255,0x10);  sched[5]  = rk;
+    EXPAND_ASSIST(rk,assist,carry,rk,255,0x20);  sched[6]  = rk;
+    EXPAND_ASSIST(rk,assist,carry,rk,255,0x40);  sched[7]  = rk;
+    EXPAND_ASSIST(rk,assist,carry,rk,255,0x80);  sched[8]  = rk;
+    EXPAND_ASSIST(rk,assist,carry,rk,255,0x1b);  sched[9]  = rk;
+    EXPAND_ASSIST(rk,assist,carry,rk,255,0x36);  sched[10] = rk;
+}
+
+void AES_192_Key_Expansion(const unsigned char *userkey, void *key)
+{   /* Fills key[0..12] with the 13 round keys derived from the 24-byte userkey. */
+    __m128i x0,x1,x2,x3,tmp,*kp = (__m128i *)key;   /* names are fixed: EXPAND192_STEP refers to them */
+    kp[0] = x0 = _mm_loadu_si128((__m128i*)userkey);
+    tmp = x3 = _mm_loadl_epi64((__m128i*)(userkey+16)); /* 8 bytes only: a 16-byte load would read past the 24-byte key; the upper half never reaches the schedule */
+    x2 = _mm_setzero_si128();
+    EXPAND192_STEP(1,1);
+    EXPAND192_STEP(4,4);
+    EXPAND192_STEP(7,16);
+    EXPAND192_STEP(10,64);
+}
+
+void AES_256_Key_Expansion(const unsigned char *userkey, void *key)
+{   /* Fills key[0..14] with the 15 round keys derived from the 32-byte userkey. */
+    __m128i *sched = (__m128i *)key;
+    __m128i even, odd, assist, carry = _mm_setzero_si128();
+    sched[0] = even = _mm_loadu_si128((__m128i*)userkey);
+    sched[1] = odd  = _mm_loadu_si128((__m128i*)(userkey+16));
+    EXPAND_ASSIST(even,assist,carry,odd,255,0x01);  sched[2]  = even;
+    EXPAND_ASSIST(odd,assist,carry,even,170,0x01);  sched[3]  = odd;
+    EXPAND_ASSIST(even,assist,carry,odd,255,0x02);  sched[4]  = even;
+    EXPAND_ASSIST(odd,assist,carry,even,170,0x02);  sched[5]  = odd;
+    EXPAND_ASSIST(even,assist,carry,odd,255,0x04);  sched[6]  = even;
+    EXPAND_ASSIST(odd,assist,carry,even,170,0x04);  sched[7]  = odd;
+    EXPAND_ASSIST(even,assist,carry,odd,255,0x08);  sched[8]  = even;
+    EXPAND_ASSIST(odd,assist,carry,even,170,0x08);  sched[9]  = odd;
+    EXPAND_ASSIST(even,assist,carry,odd,255,0x10);  sched[10] = even;
+    EXPAND_ASSIST(odd,assist,carry,even,170,0x10);  sched[11] = odd;
+    EXPAND_ASSIST(even,assist,carry,odd,255,0x20);  sched[12] = even;
+    EXPAND_ASSIST(odd,assist,carry,even,170,0x20);  sched[13] = odd;
+    EXPAND_ASSIST(even,assist,carry,odd,255,0x40);  sched[14] = even;
+}
+
+int AES_set_encrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key)
+{   /* Expands userKey into key. Returns 0 on success, -2 if bits is not 128, 192 or 256. */
+    switch (bits) {
+    case 128: AES_128_Key_Expansion (userKey,key); break;
+    case 192: AES_192_Key_Expansion (userKey,key); break;
+    case 256: AES_256_Key_Expansion (userKey,key); break;
+    default:  /* unsupported size: nothing was expanded, so do not report success; *key is left untouched */
+        return -2;
+    }
+    #if (OCB_KEY_LEN == 0)
+    	key->rounds = 6+bits/32;         /* 10, 12 or 14 rounds */
+    #endif
+    return 0;
+}
+
+ void AES_set_decrypt_key_fast(AES_KEY *dkey, const AES_KEY *ekey)
+{   /* Builds dkey from ekey: the round keys in reverse order, the inner ones passed through AESIMC. */
+    const int nr = ROUNDS(ekey);
+    int r;
+    #if (OCB_KEY_LEN == 0)
+    	dkey->rounds = nr;
+    #endif
+    dkey->rd_key[nr] = ekey->rd_key[0];               /* first and last keys are copied unchanged */
+    for (r = 1; r < nr; r++)
+        dkey->rd_key[nr-r] = _mm_aesimc_si128(ekey->rd_key[r]);
+    dkey->rd_key[0] = ekey->rd_key[nr];
+}
+
+int AES_set_decrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key)
+{   /* Builds the encryption schedule in a scratch key, then derives the decryption schedule from it. Always returns 0. */
+    AES_KEY enc_sched;
+    (void) AES_set_encrypt_key(userKey, bits, &enc_sched);
+    AES_set_decrypt_key_fast(key, &enc_sched);
+    return 0;
+}
+
+static inline void AES_encrypt(const unsigned char *in,
+                        unsigned char *out, const AES_KEY *key)
+{   /* Encrypts one 16-byte block; in and out are accessed with aligned 128-bit load/store. */
+	const __m128i *rk = ((__m128i *)(key->rd_key));
+	const int last = ROUNDS(key);
+	int r = 1;
+	__m128i state = _mm_xor_si128(_mm_load_si128((__m128i*)in), rk[0]);
+	while (r < last)
+		state = _mm_aesenc_si128(state, rk[r++]);
+	_mm_store_si128((__m128i*)out, _mm_aesenclast_si128(state, rk[r]));
+}
+
+static inline void AES_decrypt(const unsigned char *in,
+                        unsigned char *out, const AES_KEY *key)
+{   /* Decrypts one 16-byte block with a decryption schedule; in and out use aligned 128-bit load/store. */
+	const __m128i *rk = ((__m128i *)(key->rd_key));
+	const int last = ROUNDS(key);
+	int r = 1;
+	__m128i state = _mm_xor_si128(_mm_load_si128((__m128i*)in), rk[0]);
+	while (r < last)
+		state = _mm_aesdec_si128(state, rk[r++]);
+	_mm_store_si128((__m128i*)out, _mm_aesdeclast_si128(state, rk[r]));
+}
+
+static inline void AES_ecb_encrypt_blks(block *blks, unsigned nblks, AES_KEY *key) {
+	unsigned b, r, last = ROUNDS(key);   /* round-major: all blocks finish round r before any starts r+1 */
+	const __m128i *rk = ((__m128i *)(key->rd_key));
+	for (b = 0; b < nblks; b++)
+		blks[b] = _mm_xor_si128(blks[b], rk[0]);
+	for (r = 1; r < last; r++)
+		for (b = 0; b < nblks; b++)
+			blks[b] = _mm_aesenc_si128(blks[b], rk[r]);
+	for (b = 0; b < nblks; b++)
+		blks[b] = _mm_aesenclast_si128(blks[b], rk[r]);
+}
+
+static inline void AES_ecb_decrypt_blks(block *blks, unsigned nblks, AES_KEY *key) {
+	unsigned b, r, last = ROUNDS(key);   /* round-major: all blocks finish round r before any starts r+1 */
+	const __m128i *rk = ((__m128i *)(key->rd_key));
+	for (b = 0; b < nblks; b++)
+		blks[b] = _mm_xor_si128(blks[b], rk[0]);
+	for (r = 1; r < last; r++)
+		for (b = 0; b < nblks; b++)
+			blks[b] = _mm_aesdec_si128(blks[b], rk[r]);
+	for (b = 0; b < nblks; b++)
+		blks[b] = _mm_aesdeclast_si128(blks[b], rk[r]);
+}
+
+#define BPI 8  /* Number of blocks handed to each AES_ecb_*_blks call (only 4 and 8 are handled below) */
+               /* Tuning: set to 4 for Westmere, 8 for Sandy Bridge */
+
+#endif
+
+/* ----------------------------------------------------------------------- */
+/* Define OCB context structure.                                           */
+/* ----------------------------------------------------------------------- */
+
+/*------------------------------------------------------------------------
+/ Each item in the OCB context is stored either "memory correct" or
+/ "register correct". On big-endian machines, this is identical. On
+/ little-endian machines, one must choose whether the byte-string
+/ is in the correct order when it resides in memory or in registers.
+/ It must be register correct whenever it is to be manipulated
+/ arithmetically, but must be memory correct whenever it interacts
+/ with the plaintext or ciphertext.
+/------------------------------------------------------------------------- */
+ 
+struct _ae_ctx {
+    block offset;                          /* Memory correct: running offset of the message being processed */
+    block checksum;                        /* Memory correct: XOR of the plaintext blocks seen so far */
+    block Lstar;                           /* Memory correct: AES of the all-zero block, set by ae_init */
+    block Ldollar;                         /* Memory correct: Lstar doubled once */
+    block L[L_TABLE_SZ];                   /* Memory correct: L[0] is Ldollar doubled, L[i] is L[i-1] doubled */
+    block ad_checksum;                     /* Memory correct: XOR of the enciphered associated-data blocks */
+    block ad_offset;                       /* Memory correct: running offset for associated data */
+    block cached_Top;                      /* Memory correct: nonce block (low 6 bits cleared) that KtopStr was derived from */
+	uint64_t KtopStr[3];                   /* Register correct, each item  */
+    uint32_t ad_blocks_processed;          /* Associated-data blocks consumed so far, counted in full BPI batches */
+    uint32_t blocks_processed;             /* Message blocks consumed so far, counted in full BPI batches */
+    AES_KEY decrypt_key;                   /* AES decryption key schedule */
+    AES_KEY encrypt_key;                   /* AES encryption key schedule */
+    #if (OCB_TAG_LEN == 0)
+    unsigned tag_len;                      /* Tag length in bytes as given to ae_init (present only when not fixed at build time) */
+    #endif
+};
+
+/* ----------------------------------------------------------------------- */
+/* L table lookup (or on-the-fly generation)                               */
+/* ----------------------------------------------------------------------- */
+
+#if L_TABLE_SZ_IS_ENOUGH
+#define getL(_ctx, _tz) ((_ctx)->L[_tz])
+#else
+static block getL(const ae_ctx *ctx, unsigned tz)
+{   /* Returns L[tz], memory correct, computing it on the fly when tz lies beyond the stored table. */
+    block acc;
+    unsigned steps;
+    if (tz < L_TABLE_SZ)
+        return ctx->L[tz];
+    /* Beyond the table: start from the last stored entry, made register
+       correct so it can be doubled, and double once per missing index. */
+    acc = swap_if_le(ctx->L[L_TABLE_SZ-1]);
+    for (steps = tz - L_TABLE_SZ + 1; steps != 0; --steps)
+        acc = double_block(acc);
+    /* Hand the result back memory correct, like the table entries */
+    return swap_if_le(acc);
+}
+#endif
+
+/* ----------------------------------------------------------------------- */
+/* Public functions                                                        */
+/* ----------------------------------------------------------------------- */
+
+/* 32-bit SSE2 and Altivec systems need to be forced to allocate memory
+   on 16-byte alignments. (I believe all major 64-bit systems do already.) */
+
+ae_ctx* ae_allocate(void *misc)
+{   /* Returns uninitialized storage for one ae_ctx, or NULL on failure; release it with ae_free(). */
+	void *mem;
+	#if (__SSE2__ && !_M_X64 && !_M_AMD64 && !__amd64__)
+		mem = _mm_malloc(sizeof(ae_ctx),16);
+	#elif (__ALTIVEC__ && !__PPC64__)
+		if (posix_memalign(&mem,16,sizeof(ae_ctx)) != 0) mem = NULL;
+	#else
+		mem = malloc(sizeof(ae_ctx));
+	#endif
+	(void) misc;                     /* this implementation has no use for misc */
+	return (ae_ctx *)mem;
+}
+
+void ae_free(ae_ctx *ctx)
+{   /* Releases a context from ae_allocate() with the deallocator matching the allocator used there. */
+	#if !(__SSE2__ && !_M_X64 && !_M_AMD64 && !__amd64__)
+		free(ctx);
+	#else
+		_mm_free(ctx);
+	#endif
+}
+
+/* ----------------------------------------------------------------------- */
+
+int ae_clear (ae_ctx *ctx) /* Wipe the whole context, key schedules included */
+{
+	(void) memset(ctx, 0, sizeof *ctx);   /* initialization is undone by this wipe */
+	return AE_SUCCESS;
+}
+
+int ae_ctx_sizeof(void) { return (int) sizeof(ae_ctx); } /* Size in bytes of one ae_ctx */
+
+/* ----------------------------------------------------------------------- */
+
+int ae_init(ae_ctx *ctx, const void *key, int key_len, int nonce_len, int tag_len)
+{   /* Keys ctx for OCB. Returns AE_SUCCESS, or AE_NOT_SUPPORTED for an unusable nonce, key or tag length (lengths in bytes). */
+    unsigned i;
+    block tmp_blk;
+    /* Only 12-byte nonces are implemented */
+    if (nonce_len != 12)
+    	return AE_NOT_SUPPORTED;
+    #if (OCB_KEY_LEN > 0)
+    key_len = OCB_KEY_LEN;               /* build-time key size overrides the argument */
+    #endif
+    /* AES takes 16, 24 or 32 key bytes; anything else would leave the key schedules unset */
+    if (key_len != 16 && key_len != 24 && key_len != 32)
+    	return AE_NOT_SUPPORTED;
+    #if (OCB_TAG_LEN == 0)
+    if (tag_len < 0 || tag_len > 16)     /* tags are cut from a single 16-byte block */
+    	return AE_NOT_SUPPORTED;
+    ctx->tag_len = tag_len;
+    #else
+    (void) tag_len;                      /* Suppress var not used error */
+    #endif
+    /* Initialize encryption & decryption keys */
+    AES_set_encrypt_key((unsigned char *)key, key_len*8, &ctx->encrypt_key);
+    #if USE_AES_NI
+    AES_set_decrypt_key_fast(&ctx->decrypt_key,&ctx->encrypt_key);
+    #else
+    AES_set_decrypt_key((unsigned char *)key, (int)(key_len*8), &ctx->decrypt_key);
+    #endif
+    /* Zero things that need zeroing */
+    ctx->cached_Top = ctx->ad_checksum = zero_block();
+    ctx->ad_blocks_processed = 0;
+    /* Compute key-dependent values: Lstar is AES of the zero block, the rest are successive doublings */
+    AES_encrypt((unsigned char *)&ctx->cached_Top,
+                            (unsigned char *)&ctx->Lstar, &ctx->encrypt_key);
+    tmp_blk = swap_if_le(ctx->Lstar);    /* doubling is arithmetic, so work register correct */
+    tmp_blk = double_block(tmp_blk);
+    ctx->Ldollar = swap_if_le(tmp_blk);
+    tmp_blk = double_block(tmp_blk);
+    ctx->L[0] = swap_if_le(tmp_blk);
+    for (i = 1; i < L_TABLE_SZ; i++) {
+		tmp_blk = double_block(tmp_blk);
+    	ctx->L[i] = swap_if_le(tmp_blk);
+    }
+    return AE_SUCCESS;
+}
+
+/* ----------------------------------------------------------------------- */
+
+static block gen_offset_from_nonce(ae_ctx *ctx, const void *nonce)
+{   /* Returns the initial message offset for a 12-byte nonce, reusing the cached AES output when the nonce's top bits repeat. */
+	const union { unsigned x; unsigned char endian; } little = { 1 };
+	union { uint32_t u32[4]; uint8_t u8[16]; block bl; } tmp;
+	unsigned idx;
+
+	/* Nonce block: bytes 00 00 00 01, then the 12 nonce bytes */
+	tmp.u32[0] = (little.endian?0x01000000:0x00000001);
+	/* Byte-wise copy: nonce is an untyped pointer that need not be 4-byte aligned */
+	memcpy(tmp.u8 + 4, nonce, 12);
+	idx = (unsigned)(tmp.u8[15] & 0x3f);   /* Get low 6 bits of nonce  */
+	tmp.u8[15] = tmp.u8[15] & 0xc0;        /* Zero low 6 bits of nonce */
+	/* Replace cached nonce Top if needed */
+	if ( unequal_blocks(tmp.bl,ctx->cached_Top) )   { /* Cached?       */
+		ctx->cached_Top = tmp.bl;          /* Update cache, KtopStr    */
+		AES_encrypt(tmp.u8, (unsigned char *)&ctx->KtopStr, &ctx->encrypt_key);
+		if (little.endian) {               /* Make Register Correct    */
+			ctx->KtopStr[0] = bswap64(ctx->KtopStr[0]);
+			ctx->KtopStr[1] = bswap64(ctx->KtopStr[1]);
+		}
+		ctx->KtopStr[2] = ctx->KtopStr[0] ^
+						 (ctx->KtopStr[0] << 8) ^ (ctx->KtopStr[1] >> 56);
+	}
+	return gen_offset(ctx->KtopStr, idx);
+}
+
+ void process_ad(ae_ctx *ctx, const void *ad, int ad_len, int final) /* Folds ad_len bytes of associated data into ctx->ad_checksum */
+{
+	union { uint32_t u32[4]; uint8_t u8[16]; block bl; } tmp;
+    block ad_offset, ad_checksum;
+    const block *  adp = (block *)ad;
+	unsigned i,k,tz,remaining;
+    /* Work on local copies of the running associated-data state */
+    ad_offset = ctx->ad_offset;
+    ad_checksum = ctx->ad_checksum;
+    i = ad_len/(BPI*16);                   /* Number of full BPI-block batches */
+    if (i) {
+		unsigned ad_block_num = ctx->ad_blocks_processed;
+		do {
+			block ta[BPI], oa[BPI];
+			ad_block_num += BPI;
+			tz = ntz(ad_block_num);        /* Selects the L value for the last block of this batch */
+			oa[0] = xor_block(ad_offset, ctx->L[0]);
+			ta[0] = xor_block(oa[0], adp[0]);
+			oa[1] = xor_block(oa[0], ctx->L[1]);
+			ta[1] = xor_block(oa[1], adp[1]);
+			oa[2] = xor_block(ad_offset, ctx->L[1]);
+			ta[2] = xor_block(oa[2], adp[2]);
+			#if BPI == 4
+				ad_offset = xor_block(oa[2], getL(ctx, tz));
+				ta[3] = xor_block(ad_offset, adp[3]);
+			#elif BPI == 8
+				oa[3] = xor_block(oa[2], ctx->L[2]);
+				ta[3] = xor_block(oa[3], adp[3]);
+				oa[4] = xor_block(oa[1], ctx->L[2]);
+				ta[4] = xor_block(oa[4], adp[4]);
+				oa[5] = xor_block(oa[0], ctx->L[2]);
+				ta[5] = xor_block(oa[5], adp[5]);
+				oa[6] = xor_block(ad_offset, ctx->L[2]);
+				ta[6] = xor_block(oa[6], adp[6]);
+				ad_offset = xor_block(oa[6], getL(ctx, tz));
+				ta[7] = xor_block(ad_offset, adp[7]);
+			#endif
+			AES_ecb_encrypt_blks(ta,BPI,&ctx->encrypt_key);  /* Encipher the whole batch in one call */
+			ad_checksum = xor_block(ad_checksum, ta[0]);
+			ad_checksum = xor_block(ad_checksum, ta[1]);
+			ad_checksum = xor_block(ad_checksum, ta[2]);
+			ad_checksum = xor_block(ad_checksum, ta[3]);
+			#if (BPI == 8)
+			ad_checksum = xor_block(ad_checksum, ta[4]);
+			ad_checksum = xor_block(ad_checksum, ta[5]);
+			ad_checksum = xor_block(ad_checksum, ta[6]);
+			ad_checksum = xor_block(ad_checksum, ta[7]);
+			#endif
+			adp += BPI;
+		} while (--i);
+		ctx->ad_blocks_processed = ad_block_num;   /* Save running state for a later call */
+		ctx->ad_offset = ad_offset;
+		ctx->ad_checksum = ad_checksum;
+	}
+
+    if (final) {                           /* Bytes past the last full batch are consumed only on the final call */
+		block ta[BPI];
+		
+        /* Process remaining associated data, compute its tag contribution */
+        remaining = ((unsigned)ad_len) % (BPI*16);
+        if (remaining) {
+			k=0;                           /* Number of blocks placed in ta[] so far */
+			#if (BPI == 8)
+			if (remaining >= 64) {         /* Four whole blocks */
+				tmp.bl = xor_block(ad_offset, ctx->L[0]);
+				ta[0] = xor_block(tmp.bl, adp[0]);
+				tmp.bl = xor_block(tmp.bl, ctx->L[1]);
+				ta[1] = xor_block(tmp.bl, adp[1]);
+				ad_offset = xor_block(ad_offset, ctx->L[1]);
+				ta[2] = xor_block(ad_offset, adp[2]);
+				ad_offset = xor_block(ad_offset, ctx->L[2]);
+				ta[3] = xor_block(ad_offset, adp[3]);
+				remaining -= 64;
+				k=4;
+			}
+			#endif
+			if (remaining >= 32) {         /* Two whole blocks */
+				ad_offset = xor_block(ad_offset, ctx->L[0]);
+				ta[k] = xor_block(ad_offset, adp[k]);
+				ad_offset = xor_block(ad_offset, getL(ctx, ntz(k+2)));
+				ta[k+1] = xor_block(ad_offset, adp[k+1]);
+				remaining -= 32;
+				k+=2;
+			}
+			if (remaining >= 16) {         /* One whole block */
+				ad_offset = xor_block(ad_offset, ctx->L[0]);
+				ta[k] = xor_block(ad_offset, adp[k]);
+				remaining = remaining - 16;
+				++k;
+			}
+			if (remaining) {               /* Partial block: pad with 0x80 then zeros, offset uses Lstar */
+				ad_offset = xor_block(ad_offset,ctx->Lstar);
+				tmp.bl = zero_block();
+				memcpy(tmp.u8, adp+k, remaining);
+				tmp.u8[remaining] = (unsigned char)0x80u;
+				ta[k] = xor_block(ad_offset, tmp.bl);
+				++k;
+			}
+			AES_ecb_encrypt_blks(ta,k,&ctx->encrypt_key);
+			switch (k) {                   /* Cases fall through on purpose: XOR in ta[k-1] down to ta[0] */
+				#if (BPI == 8)
+				case 8: ad_checksum = xor_block(ad_checksum, ta[7]);
+				case 7: ad_checksum = xor_block(ad_checksum, ta[6]);
+				case 6: ad_checksum = xor_block(ad_checksum, ta[5]);
+				case 5: ad_checksum = xor_block(ad_checksum, ta[4]);
+				#endif
+				case 4: ad_checksum = xor_block(ad_checksum, ta[3]);
+				case 3: ad_checksum = xor_block(ad_checksum, ta[2]);
+				case 2: ad_checksum = xor_block(ad_checksum, ta[1]);
+				case 1: ad_checksum = xor_block(ad_checksum, ta[0]);
+			}
+			ctx->ad_checksum = ad_checksum;   /* Only the checksum is stored back here; ad_offset is not */
+		}
+	}
+}
+
+/* ----------------------------------------------------------------------- */
+
+int ae_encrypt(ae_ctx     *  ctx,    /* context keyed by ae_init                                   */
+               const void *  nonce,  /* non-NULL starts a new message; 12 bytes are read           */
+               const void *pt,       /* plaintext input                                            */
+               int         pt_len,   /* plaintext length in bytes                                  */
+               const void *ad,       /* associated data, used only when ad_len > 0                 */
+               int         ad_len,   /* > 0: bytes of ad; < 0 with a nonce: keep previous ad_checksum */
+               void       *ct,       /* ciphertext output (tag is appended here when tag is NULL)  */
+               void       *tag,      /* tag output, or NULL to append the tag to ct                */
+               int         final)    /* nonzero: finish the message and produce the tag            */
+{
+	union { uint32_t u32[4]; uint8_t u8[16]; block bl; } tmp;
+    block offset, checksum;
+    unsigned i, k;
+    block       * ctp = (block *)ct;
+    const block * ptp = (block *)pt;
+
+    /* Non-null nonce means start of new message, init per-message values */
+    if (nonce) {
+        ctx->offset = gen_offset_from_nonce(ctx, nonce);
+        ctx->ad_offset = ctx->checksum   = zero_block();
+        ctx->ad_blocks_processed = ctx->blocks_processed    = 0;
+        if (ad_len >= 0)
+        	ctx->ad_checksum = zero_block();
+    }
+
+	/* Process associated data */
+	if (ad_len > 0)
+		process_ad(ctx, ad, ad_len, final);
+
+	/* Encrypt plaintext data BPI blocks at a time */
+    offset = ctx->offset;
+    checksum  = ctx->checksum;
+    i = pt_len/(BPI*16);                 /* Number of full BPI-block batches */
+    if (i) {
+    	block oa[BPI];
+    	unsigned block_num = ctx->blocks_processed;
+    	oa[BPI-1] = offset;              /* oa[BPI-1] always holds the offset of the previous batch's last block */
+		do {
+			block ta[BPI];
+			block_num += BPI;
+			oa[0] = xor_block(oa[BPI-1], ctx->L[0]);
+			ta[0] = xor_block(oa[0], ptp[0]);
+			checksum = xor_block(checksum, ptp[0]);
+			oa[1] = xor_block(oa[0], ctx->L[1]);
+			ta[1] = xor_block(oa[1], ptp[1]);
+			checksum = xor_block(checksum, ptp[1]);
+			oa[2] = xor_block(oa[1], ctx->L[0]);
+			ta[2] = xor_block(oa[2], ptp[2]);
+			checksum = xor_block(checksum, ptp[2]);
+			#if BPI == 4
+				oa[3] = xor_block(oa[2], getL(ctx, ntz(block_num)));
+				ta[3] = xor_block(oa[3], ptp[3]);
+				checksum = xor_block(checksum, ptp[3]);
+			#elif BPI == 8
+				oa[3] = xor_block(oa[2], ctx->L[2]);
+				ta[3] = xor_block(oa[3], ptp[3]);
+				checksum = xor_block(checksum, ptp[3]);
+				oa[4] = xor_block(oa[1], ctx->L[2]);
+				ta[4] = xor_block(oa[4], ptp[4]);
+				checksum = xor_block(checksum, ptp[4]);
+				oa[5] = xor_block(oa[0], ctx->L[2]);
+				ta[5] = xor_block(oa[5], ptp[5]);
+				checksum = xor_block(checksum, ptp[5]);
+				oa[6] = xor_block(oa[7], ctx->L[2]);   /* oa[7] still holds the previous batch's last offset here */
+				ta[6] = xor_block(oa[6], ptp[6]);
+				checksum = xor_block(checksum, ptp[6]);
+				oa[7] = xor_block(oa[6], getL(ctx, ntz(block_num)));
+				ta[7] = xor_block(oa[7], ptp[7]);
+				checksum = xor_block(checksum, ptp[7]);
+			#endif
+			AES_ecb_encrypt_blks(ta,BPI,&ctx->encrypt_key);
+			ctp[0] = xor_block(ta[0], oa[0]);          /* ciphertext = AES(offset ^ pt) ^ offset */
+			ctp[1] = xor_block(ta[1], oa[1]);
+			ctp[2] = xor_block(ta[2], oa[2]);
+			ctp[3] = xor_block(ta[3], oa[3]);
+			#if (BPI == 8)
+			ctp[4] = xor_block(ta[4], oa[4]);
+			ctp[5] = xor_block(ta[5], oa[5]);
+			ctp[6] = xor_block(ta[6], oa[6]);
+			ctp[7] = xor_block(ta[7], oa[7]);
+			#endif
+			ptp += BPI;
+			ctp += BPI;
+		} while (--i);
+    	ctx->offset = offset = oa[BPI-1];              /* Save running state for a later call */
+	    ctx->blocks_processed = block_num;
+		ctx->checksum = checksum;
+    }
+    
+    if (final) {
+		block ta[BPI+1], oa[BPI];                      /* One extra ta[] slot for the tag block */
+				
+        /* Process remaining plaintext and compute its tag contribution    */
+        unsigned remaining = ((unsigned)pt_len) % (BPI*16);
+        k = 0;                      /* How many blocks in ta[] need ECBing */
+        if (remaining) {
+			#if (BPI == 8)
+			if (remaining >= 64) {                     /* Four whole blocks */
+				oa[0] = xor_block(offset, ctx->L[0]);
+				ta[0] = xor_block(oa[0], ptp[0]);
+				checksum = xor_block(checksum, ptp[0]);
+				oa[1] = xor_block(oa[0], ctx->L[1]);
+				ta[1] = xor_block(oa[1], ptp[1]);
+				checksum = xor_block(checksum, ptp[1]);
+				oa[2] = xor_block(oa[1], ctx->L[0]);
+				ta[2] = xor_block(oa[2], ptp[2]);
+				checksum = xor_block(checksum, ptp[2]);
+				offset = oa[3] = xor_block(oa[2], ctx->L[2]);
+				ta[3] = xor_block(offset, ptp[3]);
+				checksum = xor_block(checksum, ptp[3]);
+				remaining -= 64;
+				k = 4;
+			}
+			#endif
+			if (remaining >= 32) {                     /* Two whole blocks */
+				oa[k] = xor_block(offset, ctx->L[0]);
+				ta[k] = xor_block(oa[k], ptp[k]);
+				checksum = xor_block(checksum, ptp[k]);
+				offset = oa[k+1] = xor_block(oa[k], ctx->L[1]);
+				ta[k+1] = xor_block(offset, ptp[k+1]);
+				checksum = xor_block(checksum, ptp[k+1]);
+				remaining -= 32;
+				k+=2;
+			}
+			if (remaining >= 16) {                     /* One whole block */
+				offset = oa[k] = xor_block(offset, ctx->L[0]);
+				ta[k] = xor_block(offset, ptp[k]);
+				checksum = xor_block(checksum, ptp[k]);
+				remaining -= 16;
+				++k;
+			}
+			if (remaining) {                           /* Partial block: checksum takes pt padded with 0x80 then zeros */
+				tmp.bl = zero_block();
+				memcpy(tmp.u8, ptp+k, remaining);
+				tmp.u8[remaining] = (unsigned char)0x80u;
+				checksum = xor_block(checksum, tmp.bl);
+				ta[k] = offset = xor_block(offset,ctx->Lstar);   /* Enciphered below into the pad for this block */
+				++k;
+			}
+		}
+        offset = xor_block(offset, ctx->Ldollar);      /* Part of tag gen */
+        ta[k] = xor_block(offset, checksum);           /* Part of tag gen */
+		AES_ecb_encrypt_blks(ta,k+1,&ctx->encrypt_key);
+		offset = xor_block(ta[k], ctx->ad_checksum);   /* Part of tag gen */
+		if (remaining) {                               /* Nonzero here only if a partial block was handled */
+			--k;
+			tmp.bl = xor_block(tmp.bl, ta[k]);         /* padded plaintext ^ pad */
+			memcpy(ctp+k, tmp.u8, remaining);          /* Emit only the real bytes */
+		}
+		switch (k) {                                   /* Cases fall through on purpose: emit blocks k-1 down to 0 */
+			#if (BPI == 8)
+			case 7: ctp[6] = xor_block(ta[6], oa[6]);
+			case 6: ctp[5] = xor_block(ta[5], oa[5]);
+			case 5: ctp[4] = xor_block(ta[4], oa[4]);
+			case 4: ctp[3] = xor_block(ta[3], oa[3]);
+			#endif
+			case 3: ctp[2] = xor_block(ta[2], oa[2]);
+			case 2: ctp[1] = xor_block(ta[1], oa[1]);
+			case 1: ctp[0] = xor_block(ta[0], oa[0]);
+		}
+        
+        /* The tag goes to the caller's tag buffer when one is given,
+         * otherwise right after the ciphertext (and pt_len grows by the tag length) */
+        if (tag) {
+			#if (OCB_TAG_LEN == 16)
+            	*(block *)tag = offset;
+			#elif (OCB_TAG_LEN > 0)
+	            memcpy((char *)tag, &offset, OCB_TAG_LEN);
+			#else
+	            memcpy((char *)tag, &offset, ctx->tag_len);
+	        #endif
+        } else {
+			#if (OCB_TAG_LEN > 0)
+	            memcpy((char *)ct + pt_len, &offset, OCB_TAG_LEN);
+            	pt_len += OCB_TAG_LEN;
+			#else
+	            memcpy((char *)ct + pt_len, &offset, ctx->tag_len);
+            	pt_len += ctx->tag_len;
+	        #endif
+        }
+    }
+    return (int) pt_len;                 /* Bytes written to ct, counting an appended tag */
+}
+
+/* ----------------------------------------------------------------------- */
+
+int ae_decrypt(ae_ctx     *ctx,      /* context keyed by ae_init                                    */
+               const void *nonce,    /* non-NULL starts a new message; 12 bytes are read            */
+               const void *ct,       /* ciphertext input (tag follows it when tag is NULL)          */
+               int         ct_len,   /* ciphertext length in bytes, counting a bundled tag          */
+               const void *ad,       /* associated data, used only when ad_len > 0                  */
+               int         ad_len,   /* > 0: bytes of ad; < 0 with a nonce: keep previous ad_checksum */
+               void       *pt,       /* plaintext output; written even if AE_INVALID is returned    */
+               const void *tag,      /* expected tag, or NULL if it is bundled at the end of ct     */
+               int         final)    /* nonzero: finish the message and verify the tag              */
+{
+	union { uint32_t u32[4]; uint8_t u8[16]; block bl; } tmp;
+    block offset, checksum;
+    unsigned i, k;
+    block       *ctp = (block *)ct;
+    block       *ptp = (block *)pt;
+	/* Reduce ct_len if the tag is bundled in ct */
+	if ((final) && (!tag))
+		#if (OCB_TAG_LEN > 0)
+			ct_len -= OCB_TAG_LEN;
+		#else
+			ct_len -= ctx->tag_len;
+		#endif
+	/* Input shorter than its own tag: reject before the negative length is used as a byte count */
+	if (ct_len < 0) return AE_INVALID;
+    /* Non-null nonce means start of new message, init per-message values */
+    if (nonce) {
+        ctx->offset = gen_offset_from_nonce(ctx, nonce);
+        ctx->ad_offset = ctx->checksum   = zero_block();
+        ctx->ad_blocks_processed = ctx->blocks_processed    = 0;
+        if (ad_len >= 0)
+        	ctx->ad_checksum = zero_block();
+    }
+
+	/* Process associated data */
+	if (ad_len > 0)
+		process_ad(ctx, ad, ad_len, final);
+
+	/* Decrypt ciphertext data BPI blocks at a time */
+    offset = ctx->offset;
+    checksum  = ctx->checksum;
+    i = ct_len/(BPI*16);
+    if (i) {
+    	block oa[BPI];
+    	unsigned block_num = ctx->blocks_processed;
+    	oa[BPI-1] = offset;              /* oa[BPI-1] always holds the offset of the previous batch's last block */
+		do {
+			block ta[BPI];
+			block_num += BPI;
+			oa[0] = xor_block(oa[BPI-1], ctx->L[0]);
+			ta[0] = xor_block(oa[0], ctp[0]);
+			oa[1] = xor_block(oa[0], ctx->L[1]);
+			ta[1] = xor_block(oa[1], ctp[1]);
+			oa[2] = xor_block(oa[1], ctx->L[0]);
+			ta[2] = xor_block(oa[2], ctp[2]);
+			#if BPI == 4
+				oa[3] = xor_block(oa[2], getL(ctx, ntz(block_num)));
+				ta[3] = xor_block(oa[3], ctp[3]);
+			#elif BPI == 8
+				oa[3] = xor_block(oa[2], ctx->L[2]);
+				ta[3] = xor_block(oa[3], ctp[3]);
+				oa[4] = xor_block(oa[1], ctx->L[2]);
+				ta[4] = xor_block(oa[4], ctp[4]);
+				oa[5] = xor_block(oa[0], ctx->L[2]);
+				ta[5] = xor_block(oa[5], ctp[5]);
+				oa[6] = xor_block(oa[7], ctx->L[2]);   /* oa[7] still holds the previous batch's last offset here */
+				ta[6] = xor_block(oa[6], ctp[6]);
+				oa[7] = xor_block(oa[6], getL(ctx, ntz(block_num)));
+				ta[7] = xor_block(oa[7], ctp[7]);
+			#endif
+			AES_ecb_decrypt_blks(ta,BPI,&ctx->decrypt_key);
+			ptp[0] = xor_block(ta[0], oa[0]);
+			checksum = xor_block(checksum, ptp[0]);
+			ptp[1] = xor_block(ta[1], oa[1]);
+			checksum = xor_block(checksum, ptp[1]);
+			ptp[2] = xor_block(ta[2], oa[2]);
+			checksum = xor_block(checksum, ptp[2]);
+			ptp[3] = xor_block(ta[3], oa[3]);
+			checksum = xor_block(checksum, ptp[3]);
+			#if (BPI == 8)
+			ptp[4] = xor_block(ta[4], oa[4]);
+			checksum = xor_block(checksum, ptp[4]);
+			ptp[5] = xor_block(ta[5], oa[5]);
+			checksum = xor_block(checksum, ptp[5]);
+			ptp[6] = xor_block(ta[6], oa[6]);
+			checksum = xor_block(checksum, ptp[6]);
+			ptp[7] = xor_block(ta[7], oa[7]);
+			checksum = xor_block(checksum, ptp[7]);
+			#endif
+			ptp += BPI;
+			ctp += BPI;
+		} while (--i);
+    	ctx->offset = offset = oa[BPI-1];              /* Save running state for a later call */
+	    ctx->blocks_processed = block_num;
+		ctx->checksum = checksum;
+    }
+    
+    if (final) {
+		block ta[BPI+1], oa[BPI];
+				
+        /* Process remaining ciphertext and compute its tag contribution   */
+        unsigned remaining = ((unsigned)ct_len) % (BPI*16);
+        k = 0;                      /* How many blocks in ta[] need ECBing */
+        if (remaining) {
+			#if (BPI == 8)
+			if (remaining >= 64) {                     /* Four whole blocks */
+				oa[0] = xor_block(offset, ctx->L[0]);
+				ta[0] = xor_block(oa[0], ctp[0]);
+				oa[1] = xor_block(oa[0], ctx->L[1]);
+				ta[1] = xor_block(oa[1], ctp[1]);
+				oa[2] = xor_block(oa[1], ctx->L[0]);
+				ta[2] = xor_block(oa[2], ctp[2]);
+				offset = oa[3] = xor_block(oa[2], ctx->L[2]);
+				ta[3] = xor_block(offset, ctp[3]);
+				remaining -= 64;
+				k = 4;
+			}
+			#endif
+			if (remaining >= 32) {                     /* Two whole blocks */
+				oa[k] = xor_block(offset, ctx->L[0]);
+				ta[k] = xor_block(oa[k], ctp[k]);
+				offset = oa[k+1] = xor_block(oa[k], ctx->L[1]);
+				ta[k+1] = xor_block(offset, ctp[k+1]);
+				remaining -= 32;
+				k+=2;
+			}
+			if (remaining >= 16) {                     /* One whole block */
+				offset = oa[k] = xor_block(offset, ctx->L[0]);
+				ta[k] = xor_block(offset, ctp[k]);
+				remaining -= 16;
+				++k;
+			}
+			if (remaining) {                           /* Partial block: the pad comes from the encryption key */
+				block pad;
+				offset = xor_block(offset,ctx->Lstar);
+				AES_encrypt((unsigned char *)&offset, tmp.u8, &ctx->encrypt_key);
+				pad = tmp.bl;
+				memcpy(tmp.u8,ctp+k,remaining);
+				tmp.bl = xor_block(tmp.bl, pad);
+				tmp.u8[remaining] = (unsigned char)0x80u;
+				memcpy(ptp+k, tmp.u8, remaining);
+				checksum = xor_block(checksum, tmp.bl);
+			}
+		}
+		AES_ecb_decrypt_blks(ta,k,&ctx->decrypt_key);
+		switch (k) {                                   /* Cases fall through on purpose: emit blocks k-1 down to 0 */
+			#if (BPI == 8)
+			case 7: ptp[6] = xor_block(ta[6], oa[6]);
+				    checksum = xor_block(checksum, ptp[6]);
+			case 6: ptp[5] = xor_block(ta[5], oa[5]);
+				    checksum = xor_block(checksum, ptp[5]);
+			case 5: ptp[4] = xor_block(ta[4], oa[4]);
+				    checksum = xor_block(checksum, ptp[4]);
+			case 4: ptp[3] = xor_block(ta[3], oa[3]);
+				    checksum = xor_block(checksum, ptp[3]);
+			#endif
+			case 3: ptp[2] = xor_block(ta[2], oa[2]);
+				    checksum = xor_block(checksum, ptp[2]);
+			case 2: ptp[1] = xor_block(ta[1], oa[1]);
+				    checksum = xor_block(checksum, ptp[1]);
+			case 1: ptp[0] = xor_block(ta[0], oa[0]);
+				    checksum = xor_block(checksum, ptp[0]);
+		}
+		
+		/* Calculate expected tag */
+        offset = xor_block(offset, ctx->Ldollar);
+        tmp.bl = xor_block(offset, checksum);
+		AES_encrypt(tmp.u8, tmp.u8, &ctx->encrypt_key);
+		tmp.bl = xor_block(tmp.bl, ctx->ad_checksum); /* Full tag */
+
+		/* Compare with proposed tag, change ct_len if invalid */
+		if ((OCB_TAG_LEN == 16) && tag) {
+			if (unequal_blocks(tmp.bl, *(block *)tag))
+				ct_len = AE_INVALID;
+		} else {
+			#if (OCB_TAG_LEN > 0)
+				int len = OCB_TAG_LEN;
+			#else
+				int len = ctx->tag_len;
+			#endif
+			/* Constant-time compare: memcmp() may stop at the first mismatch and leak its position */
+			const unsigned char *given = tag ? (const unsigned char *)tag
+			                                 : (const unsigned char *)ct + ct_len;
+			unsigned char diff = 0; int n;
+			for (n = 0; n < len; n++) diff |= (unsigned char)(given[n] ^ tmp.u8[n]);
+			if (diff != 0)
+				ct_len = AE_INVALID;
+		}
+    }
+    return ct_len;                       /* Plaintext bytes (bundled tag excluded), or AE_INVALID */
+ }
+
+#if USE_AES_NI
+char infoString[] = "OCB (AES-NI)";          /* Names the AES backend selected at build time */
+#elif USE_REFERENCE_AES
+char infoString[] = "OCB (Reference AES)";
+#elif USE_OPENSSL_AES
+char infoString[] = "OCB (OpenSSL AES)";
+#endif /* infoString is not defined when none of the three macros is set */