Skip to content

Commit fcb44d8

Browse files
committed
Punycode
1 parent 1d49732 commit fcb44d8

File tree

2 files changed

+291
-0
lines changed

2 files changed

+291
-0
lines changed
Lines changed: 223 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,223 @@
1+
/*
2+
* Copyright 2025 okome.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package net.siisise.lang;
17+
18+
import net.siisise.io.Packet;
19+
import net.siisise.io.PacketA;
20+
import net.siisise.math.Matics;
21+
22+
/**
 * Punycode encoder and decoder following RFC 3492.
 *
 * <p>Both directions operate on the bare Punycode form: no {@code xn--} ACE
 * prefix is added or stripped, and no IDNA mapping or normalisation is applied.
 *
 * <p>Reference: https://qiita.com/msmania/items/dc0e2b8c2c5de0707435
 */
public class Punycode {
    private static final int BASE = 36;
    private static final int TMIN = 1;
    private static final int TMAX = 26;
    private static final int SKEW = 38;
    private static final int DAMP = 700;
    private static final int INITIAL_BIAS = 72;
    private static final int INITIAL_N = 128;
    private static final char DELIMIT = '-';

    /** Largest number of input code points {@link #toASCII} accepts. */
    private static final int MAX_INPUT_CODE_POINTS = 63;
    /** Largest Unicode code point (U+10FFFF). */
    private static final int MAX_CODE_POINT = 0x10FFFF;
    // Fully qualified on purpose: this package declares its own lang-like types.
    private static final int MAX_INT = java.lang.Integer.MAX_VALUE;

    /**
     * Encodes a Unicode string as Punycode.
     *
     * <p>Code points below 128 are copied to the front of the result in their
     * original order, followed by {@code '-'} when at least one was copied.
     * Every remaining code point is then emitted as a variable-length delta,
     * smallest code point first. A purely ASCII input therefore yields the
     * input followed by {@code '-'}, and an empty input yields an empty string.
     *
     * @param u text to encode
     * @return the Punycode form of {@code u}, without any {@code xn--} prefix
     * @throws IllegalStateException if {@code u} has 64 or more code points
     */
    public static java.lang.String toASCII(java.lang.String u) {
        int[] input = u.codePoints().toArray();
        if (input.length > MAX_INPUT_CODE_POINTS) {
            throw new IllegalStateException("input has " + input.length
                    + " code points, at most " + MAX_INPUT_CODE_POINTS + " are supported");
        }

        // Basic (ASCII) code points are passed through literally.
        StringBuilder out = new StringBuilder();
        for (int cp : input) {
            if (cp < INITIAL_N) {
                out.append((char) cp);
            }
        }
        int basicCount = out.length();
        if (basicCount > 0) {
            out.append(DELIMIT);
        }

        // RFC 3492 section 6.3. The length limit above keeps delta far below
        // the int range (at most 0x10FFFF * 64), so no overflow check is needed.
        int n = INITIAL_N;
        int delta = 0;
        int bias = INITIAL_BIAS;
        int damp = DAMP; // DAMP for the first delta only, 2 afterwards
        int handled = basicCount;
        while (handled < input.length) {
            // Smallest code point that has not been emitted yet.
            int next = MAX_INT;
            for (int cp : input) {
                if (cp >= n && cp < next) {
                    next = cp;
                }
            }
            // Skip over all (code point, position) states between n and next.
            delta += (next - n) * (handled + 1);
            n = next;
            for (int cp : input) {
                if (cp < n) {
                    delta++;
                } else if (cp == n) {
                    out.append(toCh(delta, bias));
                    handled++;
                    bias = adapt(delta, damp, handled);
                    damp = 2;
                    delta = 0;
                }
            }
            delta++;
            n++;
        }
        return out.toString();
    }

    /**
     * Bias adaptation function of RFC 3492 section 6.1.
     *
     * @param delta the delta that was just encoded or decoded
     * @param div divisor for the initial scaling: {@code DAMP} for the first delta, 2 afterwards
     * @param tn number of code points handled so far, including the current one (1-based)
     * @return the bias to use for the next delta
     */
    private static int adapt(int delta, int div, int tn) {
        delta /= div;
        delta += delta / tn;
        int n = 0;
        while (delta > ((BASE - TMIN) * TMAX) / 2) {
            delta /= BASE - TMIN;
            n++;
        }
        return (BASE * n) + (((BASE - TMIN + 1) * delta) / (delta + SKEW));
    }

    /**
     * Threshold for the digit at position {@code j} of a variable-length integer:
     * {@code BASE * (j + 1) - bias}, clamped to the range {@code TMIN..TMAX}.
     *
     * @param j zero-based digit position, least significant first
     * @param bias current bias
     * @return threshold t_j
     */
    private static int t(int j, int bias) {
        int threshold = BASE * (j + 1) - bias;
        if (threshold < TMIN) {
            return TMIN;
        }
        if (threshold > TMAX) {
            return TMAX;
        }
        return threshold;
    }

    /** Digit alphabet: values 0-25 map to 'a'-'z', values 26-35 map to '0'-'9'. */
    static final char[] CODE = "abcdefghijklmnopqrstuvwxyz0123456789".toCharArray();

    /**
     * Encodes one delta as a generalized variable-length integer.
     *
     * @param n delta to encode, non-negative
     * @param bias current bias, which selects the per-digit thresholds
     * @return the digits of {@code n}, least significant first
     */
    private static java.lang.String toCh(int n, int bias) {
        StringBuilder digits = new StringBuilder();
        for (int j = 0;; j++) {
            int threshold = t(j, bias);
            if (n < threshold) {
                break;
            }
            int span = BASE - threshold;
            n -= threshold;
            digits.append(CODE[threshold + (n % span)]);
            n /= span;
        }
        // The final digit is below its threshold, which marks the end of the integer.
        digits.append(CODE[n]);
        return digits.toString();
    }

    /**
     * Decodes a Punycode string back to Unicode.
     *
     * <p>Everything before the last {@code '-'} is copied to the output as-is.
     * When that {@code '-'} is the final character the copied part is returned
     * directly. Otherwise the characters after it are decoded as deltas and the
     * resulting code points are inserted into the output.
     *
     * @param a Punycode text, without any {@code xn--} prefix
     * @return the decoded string
     * @throws IllegalStateException if the encoded part contains a character that is
     *         not a Punycode digit, ends in the middle of a variable-length integer,
     *         or describes a delta that does not fit in an {@code int}
     * @throws IllegalArgumentException if a decoded value is above U+10FFFF
     */
    public static java.lang.String toUnicode(java.lang.String a) {
        int lastDelimiter = a.lastIndexOf(DELIMIT);
        StringBuilder out = new StringBuilder();
        if (lastDelimiter >= 0) {
            java.lang.String basic = a.substring(0, lastDelimiter);
            if (lastDelimiter == a.length() - 1) {
                return basic;
            }
            out.append(basic);
        }

        // RFC 3492 section 6.2, with the state kept as (n, i) instead of one
        // combined number so that long outputs cannot overflow it.
        int count = out.codePointCount(0, out.length());
        int n = INITIAL_N;
        int i = 0;
        int bias = INITIAL_BIAS;
        int damp = DAMP; // DAMP for the first delta only, 2 afterwards
        int pos = lastDelimiter + 1;
        while (pos < a.length()) {
            // Read one variable-length integer; long arithmetic lets us detect overflow.
            long delta = 0;
            long weight = 1;
            for (int j = 0;; j++) {
                if (pos >= a.length()) {
                    throw new IllegalStateException("truncated Punycode input");
                }
                int digit = num(a.charAt(pos++));
                delta += digit * weight;
                if (delta > MAX_INT - i) {
                    throw new IllegalStateException("Punycode delta overflow");
                }
                int threshold = t(j, bias);
                if (digit < threshold) {
                    break;
                }
                weight *= BASE - threshold;
                if (weight > MAX_INT) {
                    throw new IllegalStateException("Punycode delta overflow");
                }
            }

            count++;
            bias = adapt((int) delta, damp, count);
            damp = 2;

            // Advance the (n, i) state by delta, wrapping i around count.
            i += (int) delta;
            int step = i / count;
            if (step > MAX_CODE_POINT - n) {
                throw new IllegalArgumentException("decoded value is not a Unicode code point");
            }
            n += step;
            i %= count;

            out.insert(out.offsetByCodePoints(0, i), Character.toChars(n));
            i++;
        }
        return out.toString();
    }

    /**
     * Converts a Punycode digit character to its value.
     *
     * @param ch 'a'-'z' or 'A'-'Z' (0-25), or '0'-'9' (26-35)
     * @return digit value in the range 0-35
     * @throws IllegalStateException if {@code ch} is not a Punycode digit
     */
    static int num(char ch) {
        if (ch >= 'a' && ch <= 'z') {
            return ch - 'a';
        }
        if (ch >= 'A' && ch <= 'Z') {
            return ch - 'A';
        }
        if (ch >= '0' && ch <= '9') {
            return ch - '0' + 26;
        }
        throw new IllegalStateException("invalid Punycode digit '" + ch + "'");
    }

}
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
/*
2+
* Copyright 2025 okome.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package net.siisise.lang;
17+
18+
import org.junit.jupiter.api.Test;
19+
import static org.junit.jupiter.api.Assertions.*;
20+
21+
/**
22+
* Punycode test
23+
*/
24+
public class PunycodeTest {
25+
26+
public PunycodeTest() {
27+
}
28+
29+
/**
30+
* Test of toASCII method, of class Punycode.
31+
*/
32+
@Test
33+
public void testToASCII() {
34+
System.out.println("toASCII");
35+
java.lang.String u = "3年B組金八先生";
36+
java.lang.String expResult = "3B-ww4c5e180e575a65lsy2b";
37+
java.lang.String result = Punycode.toASCII(u);
38+
assertEquals(expResult, result);
39+
40+
u = "\u0644\u064a\u0647\u0645\u0627\u0628\u062A\u0643\u0644\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F";
41+
expResult = "egbpdaj6bu4bxfgehfvwxn";
42+
result = Punycode.toASCII(u);
43+
assertEquals(expResult, result);
44+
45+
u = "\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587";
46+
expResult = "ihqwcrb4cv8a8dqg056pqjye";
47+
result = Punycode.toASCII(u);
48+
assertEquals(expResult, result);
49+
50+
u = "安室奈美恵-with-SUPER-MONKEYS";
51+
expResult = "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n";
52+
result = Punycode.toASCII(u);
53+
assertEquals(expResult, result);
54+
}
55+
56+
/**
57+
* Test of toUnicode method, of class Punycode.
58+
*/
59+
@Test
60+
public void testToUnicode() {
61+
System.out.println("toUnicode");
62+
java.lang.String a = "3B-ww4c5e180e575a65lsy2b";
63+
java.lang.String expResult = "3年B組金八先生";
64+
java.lang.String result = Punycode.toUnicode(a);
65+
assertEquals(expResult, result);
66+
}
67+
68+
}

0 commit comments

Comments
 (0)