 1/draftietfidnamcacez00.txt 20061203 11:32:46.000000000 +0100
+++ 2/draftietfidnamcacez01.txt 20061203 11:32:46.000000000 +0100
@@ 1,15 +1,15 @@
INTERNETDRAFT Adam M. Costello
draftietfidnamcacez00.txt 2001Aug16
Expires 2002Feb16
+draftietfidnamcacez01.txt 2001Sep04
+Expires 2002Mar04
 AMCACEZ version 0.3.0
+ AMCACEZ version 0.3.1
Status of this Memo
This document is an InternetDraft and is in full conformance with
all provisions of Section 10 of RFC2026.
InternetDrafts are working documents of the Internet Engineering
Task Force (IETF), its areas, and its working groups. Note
that other groups may also distribute working documents as
InternetDrafts.
@@ 54,20 +54,21 @@
3.1 Basic code point segregation
3.2 Insertion unsort coding
3.3 Generalized variablelength integers
3.4 Bias adaptation
4. Bootstring parameters
5. Parameter values for AMCACEZ
6. Bootstring algorithms
6.1 Bias adaptation function
6.2 Decoding procedure
6.3 Encoding procedure
+ 6.4 Alternative methods for handling overflow
7. AMCACEZ example strings
8. Security considerations
9. References
A. Author contact information
B. Mixedcase annotation
C. AMCACEZ sample implementation
1. Introduction
The IDNA draft [IDNA] describes an architecture for supporting
@@ 112,37 +113,39 @@
AMCACEZ is a working name that should be changed if it is adopted.
(The Z merely indicates that it is the twentysixth ACE devised by
this author. Most were not worth releasing.)
2. Terminology
The key words "must", "shall", "required", "should", "recommended",
and "may" in this document are to be interpreted as described in RFC
2119 [RFC2119].
+ A "code point" is an integral value associated with a character in a
+ coded character set.
+
As in the Unicode Standard [UNICODE], Unicode code points are
denoted by "U+" followed by four to six hexadecimal digits, while a
range of code points is denoted by two hexadecimal numbers separated
by "..", with no prefixes.
The operators div and mod perform integer division; (x div y) is the
quotient of x divided by y, discarding the remainder, and (x mod y)
is the remainder, so (x div y) * y + (x mod y) == x. Bootstring
uses these operators only with nonnegative operands, so the quotient
and remainder are always nonnegative.
 The ?: operator is a conditional; (x ? y : z) means y if x is true,
 z if x is false. It is just like "if x then y else z" except that y
 and z are expressions rather than statements.

The "break" statement jumps out of the innermost loop (as in C).
+ An "overflow" is an attempt to compute a value that exceeds the
+ maximum value of an integer variable.
+
3. Bootstring description
Bootstring represents an arbitrary sequence of code points (the
"extended string") as a sequence of basic code points (the
"basic string"). This section describes the representation.
Section 6 "Bootstring algorithms" presents the algorithms as
pseudocode.
3.1 Basic code point segregation
@@ 245,24 +248,26 @@
integral value.
Bootstring uses littleendian ordering so that the deltas can be
separated starting with the first. The t(j) values are defined in
terms of the constants base, tmin, and tmax, and a state variable
called bias:
t(j) = base * (j + 1)  bias,
clamped to the range tmin through tmax
 (The clamping means that if the formula yields a value less than
 tmin or greater than tmax, then t(j) = tmin or tmax, respectively.)
 These t(j) values cause the representation to favor integers within
 a particular range determined by the bias.
+ The clamping means that if the formula yields a value less than tmin
+ or greater than tmax, then t(j) = tmin or tmax, respectively. (In
+ the pseudocode in section 6 "Bootstring algorithms", the expression
+ base * (j + 1) is denoted by k for performance reasons.) These
+ t(j) values cause the representation to favor integers within a
+ particular range determined by the bias.
3.4 Bias adaptation
After each delta is encoded or decoded, bias is set for the next
delta as follows:
1. Delta is scaled in order to avoid overflow in the next step:
let delta = delta div 2
@@ 298,25 +303,26 @@
expected to be last, tmin for the less significant digits up through
the one expected to be thirdlast, and somewhere between tmin and
tmax for the digit expected to be secondlast (balancing the hope of
the expectedlast digit being unnecessary against the danger of it
being insufficient).
4. Bootstring parameters
Given a set of basic code points, one must be designated as
the delimiter. The base can be no greater than the number of
 distinguishable basic code points remaining. The values 0 through
 base1 must be associated with nondelimiter basic code points.
 In some cases multiple code points must represent the same value;
 for example, uppercase and lowercase versions of a letter must be
 equivalent if basic strings are caseinsensitive.
+ distinguishable basic code points remaining. The digitvalues
+ in the range 0 through base1 must be associated with distinct
+ nondelimiter basic code points. In some cases multiple code points
+ must have the same digitvalue; for example, uppercase and lowercase
+ versions of the same letter must be equivalent if basic strings are
+ caseinsensitive.
The initial value of n must be no greater than the minimum nonbasic
code point that could appear in extended strings.
The remaining five parameters (tmin, tmax, skew, damp, and the
initial value of bias) must satisfy the following constraints:
0 <= tmin <= tmax <= base1
skew >= 1
damp >= 2
@@ 332,28 +338,30 @@
5. Parameter values for AMCACEZ
AMCACEZ uses the following Bootstring parameter values:
base = 36
tmin = 1
tmax = 26
skew = 38
damp = 700
initial_bias = 72
 initial_n = U+0080
+ initial_n = 0x80
In AMCACEZ, code points are Unicode code points [UNICODE], that
is, integers in the range 0..10FFFF, but not D800..DFFF, which are
reserved for use by UTF16. The basic code points are the ASCII
 code points (0..7F), some of which have values associated with them:
+ code points (0..7F), of which U+002D () is the delimiter, and some
+ of the others have digitvalues as follows:
 U+002D () = delimiter
+ code points digitvalues
+  
41..5A (AZ) = 0 to 25, respectively
61..7A (az) = 0 to 25, respectively
30..39 (09) = 26 to 35, respectively
Using hyphenminus as the delimiter implies that the ACE can end
with a hyphenminus only if the Unicode string consists entirely
of basic code points, but IDNA forbids such strings from being
ACEencoded. Furthermore, the ACE can begin with a hyphenminus
only if the Unicode string does, which is forbidden by IDNA.
Therefore IDNA using AMCACEZ, regardless of whether the signature
@@ 380,115 +388,135 @@
Such ambiguities are usually resolved by context, but in an ACE
there is no context apparent to humans.
6. Bootstring algorithms
Some parts of the pseudocode can be omitted if the parameters
satisfy certain conditions (for which AMCACEZ qualifies). These
parts are enclosed in {braces}, and notes immediately following the
pseudocode explain the conditions under which they may be omitted.
+ Formally, code points are integers, and hence the pseudocode assumes
+ that arithmetic operations can be performed directly on code points.
+ Some actual programming languages might require explicit conversion
+ between code points and integers.
+
6.1 Bias adaptation function
function adapt(delta,numpoints,firsttime):
 let delta = delta div (firsttime ? damp : 2)
+ if firsttime then let delta = delta div damp
+ else let delta = delta div 2
let delta = delta + (delta div numpoints)
let k = 0
 while delta > ((base  tmin) * tmax) div 2
 do let delta = delta div (base  tmin) and let k = k + base
+ while delta > ((base  tmin) * tmax) div 2 do begin
+ let delta = delta div (base  tmin)
+ let k = k + base
+ end
return k + (((base  tmin + 1) * delta) div (delta + skew))
+ It does not matter whether the modifications to delta and k
+ inside adapt() affect variables of the same name inside the
+ encoding/decoding procedures, because after calling adapt() the
+ caller does not read those variables before overwriting them.
+
6.2 Decoding procedure
let n = initial_n
let i = 0
let bias = initial_bias
let output = an empty string indexed from 0
consume all code points before the last delimiter (if there is one)
and copy them to output, fail on any nonbasic code point
if more than zero code points were consumed then consume one more
+ (which will be the last delimiter)
while the input is not exhausted do begin
let oldi = i
let w = 1
for k = base to infinity in steps of base do begin
 consume a code point, fail on endofinput
 let digit = the code point's value, fail if it has no value
+ consume a code point, or fail if there was none to consume
+ let digit = the code point's digitvalue, fail if it has none
let i = i + digit * w, fail on overflow
 let t = k <= bias ? tmin : k  bias > tmax ? tmax : k  bias
+ let t = tmin if k <= bias, tmax if k >= bias + tmax, or
+ k  bias otherwise
if digit < t then break
let w = w * (base  t), fail on overflow
end
 let bias = adapt(i  oldi, length(output) + 1, oldi == 0)
+ let bias = adapt(i  oldi, length(output) + 1, test oldi is 0?)
let n = n + i div (length(output) + 1), fail on overflow
let i = i mod (length(output) + 1)
{if n is a basic code point then fail}
insert n into output at position i
increment i
end
+
The statement enclosed in braces (checking whether n is a basic
code point) may be omitted if initial_n exceeds all basic code
points (which is true for AMCACEZ), because n is never less than
initial_n.
Because the decoder state can only advance monotonically, and there
is only one representation of any delta, there is therefore only
one encoded string that can represent a given sequence of integers.
The only error conditions are invalid code points, unexpected
 endofinput, overflow (attempts to compute values that exceed the
 maximum value of an integer variable), and basic code points encoded
 using deltas instead of appearing literally. If the decoder fails
 on these errors as shown above, then it cannot produce the same
 output for two distinct inputs, and hence it need not reencode its
 output to verify that it matches the input.
+ endofinput, overflow, and basic code points encoded using deltas
+ instead of appearing literally. If the decoder fails on these
+ errors as shown above, then it cannot produce the same output for
+ two distinct inputs, and hence it need not reencode its output to
+ verify that it matches the input.
The assignment of t, where t is clamped to the range tmin through
tmax, does not handle the case where bias < k < bias + tmin, but
that is impossible because of the way bias is computed and because
of the constraints on the parameters.
If the programming language does not provide overflow detection,
the following technique can be used. Suppose A, B, and C are
representable nonnegative integers and C is nonzero. Then A + B
overflows if and only if B > maxint  A, and A + (B * C) overflows
 if and only if B > (maxint  A) div C. See appendix C "AMCACEZ
 sample implementation" for demonstrations of this technique.
+ if and only if B > (maxint  A) div C, where maxint is the greatest
+ integer for which maxint + 1 cannot be represented. Refer to
+ appendix C "AMCACEZ sample implementation" for demonstrations of
+ this technique in the C language. See also section 6.4 "Alternative
+ methods for handling overflow".
6.3 Encoding procedure
let n = initial_n
let delta = 0
let bias = initial_bias
let h = b = the number of basic code points in the input
copy them to the output in order, followed by a delimiter if b > 0
{if the input contains a nonbasic code point < n then fail}
while h < length(input) do begin
let m = the minimum {nonbasic} code point >= n in the input
let delta = delta + (m  n) * (h + 1), fail on overflow
let n = m
 for each code point m in the input (in order) do begin
 if m < n {or m is basic} then increment delta, fail on overflow
 if m == n then begin
+ for each code point c in the input (in order) do begin
+ if c < n {or c is basic} then increment delta, fail on overflow
+ if c == n then begin
let q = delta
for k = base to infinity in steps of base do begin
 let t = k <= bias ? tmin : k  bias > tmax ? tmax : k  bias
+ let t = tmin if k <= bias, tmax if k >= bias + tmax, or
+ k  bias otherwise
if q < t then break
output the code point for digit t + ((q  t) mod (base  t))
let q = (q  t) div (base  t)
end
output the code point for digit q
 let bias = adapt(delta, h + 1, h == b)
+ let bias = adapt(delta, h + 1, test h equals b?)
let delta = 0
increment h
end
end
increment delta and n
end
+
The full statement enclosed in braces (checking whether the input
contains a nonbasic code point less than n) can be omitted if all
code points less than initial_n are basic code points (which is true
for AMCACEZ if code points are unsigned).
The braceenclosed conditions "nonbasic" and "or m is basic" can be
omitted if initial_n exceeds all basic code points (which is true
for AMCACEZ), because the code point being tested is never less
than initial_n.
@@ 487,32 +515,59 @@
code points less than initial_n are basic code points (which is true
for AMCACEZ if code points are unsigned).
The braceenclosed conditions "nonbasic" and "or m is basic" can be
omitted if initial_n exceeds all basic code points (which is true
for AMCACEZ), because the code point being tested is never less
than initial_n.
The checks for overflow are necessary to avoid producing invalid
output when the input contains very large values or is very long.
 Wider integer variables can handle more extreme inputs. For
 AMCACEZ, 26bit unsigned integers are sufficient, because any
 string that required a 27bit delta would have to exceed either
 the code point limit (0..10FFFF) or the label length limit (63
 characters).
+ Wider integer variables can handle more extreme inputs. For IDNA,
+ 26bit unsigned integers are sufficient, because any string that
+ required a 27bit delta would have to exceed either the code point
+ limit (0..10FFFF) or the label length limit (63 characters).
The increment of delta at the bottom of the outer loop cannot
overflow because delta < length(input) before the increment, and
length(input) is already assumed to be representable. The increment
of n could overflow, but only if h == length(input), in which case
the procedure is finished anyway.
+6.4 Alternative methods for handling overflow
+
+ The encoding and decoding algorithms handle overflow by detecting
+ it whenever it happens. Another approach is to enforce limits on
+ the inputs that prevent overflow from happening. For example, if
+ the encoder were to verify that no input code points exceed M and
+ that the input length does not exceed L, then no delta could ever
+ exceed (M  initial_n) * (L + 1), and hence no overflow could occur
+ if integer variables were capable of representing values that large.
+ This prevention approach would impose more restrictions on the input
+ than the detection approach does, but might be considered simpler in
+ some programming languages.
+
+ In theory, the decoder could use an analogous approach, limiting the
+ number of digits in a variablelength integer (that is, limiting the
+ number of iterations in the innermost loop). However, the number
+ of digits that suffice to represent a given delta can sometimes
+ represent much larger deltas (because of the adaptation), and hence
+ this approach would probably require integers wider than 32 bits.
+
+ Yet another approach for the decoder is to allow overflow to occur,
+ but to check the final output string by reencoding it and comparing
+ to the decoder input. If and only if they do not match (using a
+ caseinsensitive ASCII comparison) overflow has occurred. This
+ delayeddetection approach would not impose any more restrictions on
+ the input than the immediatedetection approach does, and might be
+ considered simpler in some programming languages.
+
7. AMCACEZ example strings
In the AMCACEZ encodings below, the IDNA signature prefix is not
shown. AMCACEZ is abbreviated AMCZ. Backslashes show where line
breaks have been inserted in strings too long for one line.
The first several examples are all translations of the sentence "Why
can't they just speak in ?" (courtesy of Michael Kaplan's
"provincial" page [PROVINCIAL]). Word breaks and punctuation have
been removed, as is often done in domain names.
@@ 666,21 +721,21 @@
Table Specification", 1985Oct, RFC 952.
[RFC1034] P. Mockapetris, "Domain Names  Concepts and Facilities",
1987Nov, RFC 1034.
[UNICODE] The Unicode Consortium, "The Unicode Standard",
http://www.unicode.org/unicode/standard/standard.html.
A. Author contact information
 Adam M. Costello
+ Adam M. Costello
University of California, Berkeley
http://www.cs.berkeley.edu/~amc/
B. Mixedcase annotation
In order to use AMCACEZ to represent caseinsensitive strings,
higher layers need to casefold the strings prior to AMCACEZ
encoding. The encoded string can, however, use mixed case as an
annotation telling how to convert the original folded string into a
mixedcase string for display purposes.
@@ 691,24 +746,26 @@
the last of which provides the annotation. If it is uppercase,
it is a suggestion to map the nonbasic code point to uppercase
(if possible); if it is lowercase, it is a suggestion to map the
nonbasic code point to lowercase (if possible).
AMCACEZ encoders and decoders are not required to support these
annotations, and higher layers need not use them.
C. AMCACEZ sample implementation
/******************************************/
/* amcacez.c 0.3.0 (2001Aug07Tue) */
/* Adam M. Costello */
/******************************************/
+/********************************************/
+/* amcacez.c 0.3.1 (2001Sep01Sat) */
+/* http://www.cs.berkeley.edu/~amc/charset/ */
+/* Adam M. Costello */
+/* http://www.cs.berkeley.edu/~amc/ */
+/********************************************/
/* This is ANSI C code (C89) implementing AMCACEZ version 0.3.x. */
/************************************************************/
/* Public interface (would normally go in its own .h file): */
#include
enum amc_ace_status {
amc_ace_success,
@@ 819,27 +877,43 @@
/* 0..25 map to ASCII a..z or A..Z */
/* 26..35 map to ASCII 0..9 */
}
/* flagged(bcp) tests whether a basic code point is flagged */
/* (uppercase). The behavior is undefined if bcp is not a */
/* basic code point. */
#define flagged(bcp) ((amc_ace_z_uint)(bcp)  65 < 26)
/*** Useful constants ***/
+/*** Platformspecific constants ***/
/* maxint is the maximum value of an amc_ace_z_uint variable: */
static const amc_ace_z_uint maxint = 1;
+/* Because maxint is unsigned, 1 becomes the maximum value. */
/* lobase and cutoff are used in the calculation of bias: */
enum { lobase = base  tmin, cutoff = lobase * tmax / 2 };
+/*** Bias adaptation function ***/
+
+static amc_ace_z_uint adapt(
+ amc_ace_z_uint delta, amc_ace_z_uint numpoints, int firsttime )
+{
+ amc_ace_z_uint k;
+
+ delta = firsttime ? delta / damp : delta >> 1;
+ /* delta >> 1 is a faster way of doing delta / 2 */
+ delta += delta / numpoints;
+
+ for (k = 0; delta > ((base  tmin) * tmax) / 2; k += base) {
+ delta /= base  tmin;
+ }
+
+ return k + (base  tmin + 1) * delta / (delta + skew);
+}
/*** Main encode function ***/
enum amc_ace_status amc_ace_z_encode(
amc_ace_z_uint input_length,
const amc_ace_z_uint input[],
const unsigned char uppercase_flags[],
amc_ace_z_uint *output_length,
char output[] )
{
@@ 883,31 +957,27 @@
}
/* Increase delta enough to advance the decoder's */
/* state to , but guard against overflow: */
if (m  n > (maxint  delta) / (h + 1)) return amc_ace_overflow;
delta += (m  n) * (h + 1);
n = m;
for (j = 0; j < input_length; ++j) {
 #if 0
 if (input[j] < n  basic(input[j])) {
+ /* AMCACEZ does not need to check whether input[j] is basic: */
+ if (input[j] < n /*  basic(input[j]) */ ) {
if (++delta == 0) return amc_ace_overflow;
}
 #endif
 /* AMCACEZ can use this simplified version instead: */
 if (input[j] < n && ++delta == 0) return amc_ace_overflow;
if (input[j] == n) {
/* Represent delta as a generalized variablelength integer: */

for (q = delta, k = base; ; k += base) {
if (out >= max_out) return amc_ace_big_output;
t = k <= bias ? tmin : k  bias >= tmax ? tmax : k  bias;
if (q < t) break;
output[out++] = encode_digit(t + (q  t) % (base  t), 0);
q = (q  t) / (base  t);
}
output[out++] =
encode_digit(q, uppercase_flags && uppercase_flags[j]);
@@ 904,31 +974,26 @@
for (q = delta, k = base; ; k += base) {
if (out >= max_out) return amc_ace_big_output;
t = k <= bias ? tmin : k  bias >= tmax ? tmax : k  bias;
if (q < t) break;
output[out++] = encode_digit(t + (q  t) % (base  t), 0);
q = (q  t) / (base  t);
}
output[out++] =
encode_digit(q, uppercase_flags && uppercase_flags[j]);

 /* Adapt the bias: */
 delta = h == b ? delta / damp : delta >> 1;
 delta += delta / (h + 1);
 for (bias = 0; delta > cutoff; bias += base) delta /= lobase;
 bias += (lobase + 1) * delta / (delta + skew);

+ bias = adapt(delta, h + 1, h == b);
delta = 0;
++h;
}
}
+
++delta, ++n;
}
*output_length = out;
return amc_ace_success;
}
/*** Main decode function ***/
enum amc_ace_status amc_ace_z_decode(
@@ 931,22 +996,22 @@
/*** Main decode function ***/
enum amc_ace_status amc_ace_z_decode(
amc_ace_z_uint input_length,
const char input[],
amc_ace_z_uint *output_length,
amc_ace_z_uint output[],
unsigned char uppercase_flags[] )
{
 amc_ace_z_uint n, out, i, max_out, bias, b, j,
 in, oldi, w, k, delta, digit, t;
+ amc_ace_z_uint n, out, i, max_out, bias,
+ b, j, in, oldi, w, k, digit, t;
/* Initialize the state: */
n = initial_n;
out = i = 0;
max_out = *output_length;
bias = initial_bias;
/* Handle the basic code points: Let b be the number of input code */
/* points before the last delimiter, or 0 if there is none, then */
@@ 978,25 +1043,21 @@
digit = decode_digit(input[in++]);
if (digit >= base) return amc_ace_bad_input;
if (digit > (maxint  i) / w) return amc_ace_overflow;
i += digit * w;
t = k <= bias ? tmin : k  bias >= tmax ? tmax : k  bias;
if (digit < t) break;
if (w > maxint / (base  t)) return amc_ace_overflow;
w *= (base  t);
}
 /* Adapt the bias: */
 delta = oldi == 0 ? i / damp : (i  oldi) >> 1;
 delta += delta / (out + 1);
 for (bias = 0; delta > cutoff; bias += base) delta /= lobase;
 bias += (lobase + 1) * delta / (delta + skew);
+ bias = adapt(i  oldi, out + 1, oldi == 0);
/* i was supposed to wrap around from out+1 to 0, */
/* incrementing n each time, so we'll fix that now: */
if (i / (out + 1) > maxint  n) return amc_ace_overflow;
n += i / (out + 1);
i %= (out + 1);
/* Insert n at position i of the output: */
@@ 1180,11 +1240,11 @@
if (r < 0) fail(io_error);
}
return EXIT_SUCCESS;
}
usage(argv);
return EXIT_SUCCESS; /* not reached, but quiets compiler warning */
}
 INTERNETDRAFT expires 2002Feb16
+ INTERNETDRAFT expires 2002Mar04