Line 0
Link Here
|
|
|
1 |
/* |
2 |
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. |
3 |
* |
4 |
* Copyright 1997-2007 Sun Microsystems, Inc. All rights reserved. |
5 |
* |
6 |
* The contents of this file are subject to the terms of either the GNU |
7 |
* General Public License Version 2 only ("GPL") or the Common |
8 |
* Development and Distribution License("CDDL") (collectively, the |
9 |
* "License"). You may not use this file except in compliance with the |
10 |
* License. You can obtain a copy of the License at |
11 |
* http://www.netbeans.org/cddl-gplv2.html |
12 |
* or nbbuild/licenses/CDDL-GPL-2-CP. See the License for the |
13 |
* specific language governing permissions and limitations under the |
14 |
* License. When distributing the software, include this License Header |
15 |
* Notice in each file and include the License file at |
16 |
* nbbuild/licenses/CDDL-GPL-2-CP. Sun designates this |
17 |
* particular file as subject to the "Classpath" exception as provided |
18 |
* by Sun in the GPL Version 2 section of the License file that |
19 |
* accompanied this code. If applicable, add the following below the |
20 |
* License Header, with the fields enclosed by brackets [] replaced by |
21 |
* your own identifying information: |
22 |
* "Portions Copyrighted [year] [name of copyright owner]" |
23 |
* |
24 |
* Contributor(s): |
25 |
* |
26 |
* The Original Software is NetBeans. The Initial Developer of the Original |
27 |
* Software is Sun Microsystems, Inc. Portions Copyright 1997-2006 Sun |
28 |
* Microsystems, Inc. All Rights Reserved. |
29 |
* |
30 |
* If you wish your version of this file to be governed by only the CDDL |
31 |
* or only the GPL Version 2, indicate your decision by adding |
32 |
* "[Contributor] elects to include this software in this distribution |
33 |
* under the [CDDL or GPL Version 2] license." If you do not indicate a |
34 |
* single choice of license, a recipient has the option to distribute |
35 |
* your version of this file under either the CDDL, the GPL Version 2 or |
36 |
* to extend the choice of license to its licensees as provided above. |
37 |
* However, if you add GPL Version 2 code and therefore, elected the GPL |
38 |
* Version 2 license, then the option applies only if the new code is |
39 |
* made subject to such option by the copyright holder. |
40 |
*/ |
41 |
|
42 |
package org.netbeans.editor; |
43 |
|
44 |
/**
 * An implementation of wcwidth() and wcswidth() (defined in
 * IEEE Std 1003.1-2001) for Unicode.
 *
 * <p><a href="http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html">
 * http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html</a><br>
 * <a href="http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html">
 * http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html</a></p>
 *
 * <p>In fixed-width output devices, Latin characters all occupy a single
 * "cell" position of equal width, whereas ideographic CJK characters
 * occupy two such cells. Interoperability between terminal-line
 * applications and (teletype-style) character terminals using the
 * UTF-8 encoding requires agreement on which character should advance
 * the cursor by how many cell positions. No established formal
 * standards exist at present on which Unicode character shall occupy
 * how many cell positions on character terminals. These routines are
 * a first attempt at defining such behavior based on simple rules
 * applied to data provided by the Unicode Consortium.</p>
 *
 * <p>For some graphical characters, the Unicode standard explicitly
 * defines a character-cell width via the definition of the East Asian
 * FullWidth (F), Wide (W), Half-width (H), and Narrow (Na) classes.
 * In all these cases, there is no ambiguity about which width a
 * terminal shall use. For characters in the East Asian Ambiguous (A)
 * class, the width choice depends purely on a preference of backward
 * compatibility with either historic CJK or Western practice.
 * Choosing single-width for these characters is easy to justify as
 * the appropriate long-term solution, as the CJK practice of
 * displaying these characters as double-width comes from historic
 * implementation simplicity (8-bit encoded characters were displayed
 * single-width and 16-bit ones double-width, even for Greek,
 * Cyrillic, etc.) and not any typographic considerations.</p>
 *
 * <p>Much less clear is the choice of width for the Not East Asian
 * (Neutral) class. Existing practice does not dictate a width for any
 * of these characters. It would nevertheless make sense
 * typographically to allocate two character cells to characters such
 * as for instance EM SPACE or VOLUME INTEGRAL, which cannot be
 * represented adequately with a single-width glyph. The following
 * routines at present merely assign a single-cell width to all
 * neutral characters, in the interest of simplicity. This is not
 * entirely satisfactory and should be reconsidered before
 * establishing a formal standard in this area. At the moment, the
 * decision which Not East Asian (Neutral) characters should be
 * represented by double-width glyphs cannot yet be answered by
 * applying a simple rule from the Unicode database content. Setting
 * up a proper standard for the behavior of UTF-8 character terminals
 * will require a careful analysis not only of each Unicode character,
 * but also of each presentation form, something the author of these
 * routines has avoided doing so far.</p>
 *
 * <p><a href="http://www.unicode.org/unicode/reports/tr11/">
 * http://www.unicode.org/unicode/reports/tr11/</a></p>
 *
 * <p>Markus Kuhn -- 2007-05-26 (Unicode 5.0)</p>
 *
 * <p>Permission to use, copy, modify, and distribute this software
 * for any purpose and without fee is hereby granted. The author
 * disclaims all warranties with regard to this software.</p>
 *
 * <p>Latest version: <a href="http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c">
 * http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c</a></p>
 *
 * @author johnsonlau@netbeans.org
 */
public class WcwdithUtil {

    /**
     * Sorted list of non-overlapping, inclusive {@code {first, last}}
     * intervals of non-spacing (zero-width) code points.
     *
     * <p>The ordering is relied upon by the binary search used in
     * {@link #wcwidth(int)}; entries must stay sorted and disjoint.</p>
     */
    /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */
    static final int[][] nonSpacingCharTable = {
        {0x0300, 0x036F}, {0x0483, 0x0486}, {0x0488, 0x0489},
        {0x0591, 0x05BD}, {0x05BF, 0x05BF}, {0x05C1, 0x05C2},
        {0x05C4, 0x05C5}, {0x05C7, 0x05C7}, {0x0600, 0x0603},
        {0x0610, 0x0615}, {0x064B, 0x065E}, {0x0670, 0x0670},
        {0x06D6, 0x06E4}, {0x06E7, 0x06E8}, {0x06EA, 0x06ED},
        {0x070F, 0x070F}, {0x0711, 0x0711}, {0x0730, 0x074A},
        {0x07A6, 0x07B0}, {0x07EB, 0x07F3}, {0x0901, 0x0902},
        {0x093C, 0x093C}, {0x0941, 0x0948}, {0x094D, 0x094D},
        {0x0951, 0x0954}, {0x0962, 0x0963}, {0x0981, 0x0981},
        {0x09BC, 0x09BC}, {0x09C1, 0x09C4}, {0x09CD, 0x09CD},
        {0x09E2, 0x09E3}, {0x0A01, 0x0A02}, {0x0A3C, 0x0A3C},
        {0x0A41, 0x0A42}, {0x0A47, 0x0A48}, {0x0A4B, 0x0A4D},
        {0x0A70, 0x0A71}, {0x0A81, 0x0A82}, {0x0ABC, 0x0ABC},
        {0x0AC1, 0x0AC5}, {0x0AC7, 0x0AC8}, {0x0ACD, 0x0ACD},
        {0x0AE2, 0x0AE3}, {0x0B01, 0x0B01}, {0x0B3C, 0x0B3C},
        {0x0B3F, 0x0B3F}, {0x0B41, 0x0B43}, {0x0B4D, 0x0B4D},
        {0x0B56, 0x0B56}, {0x0B82, 0x0B82}, {0x0BC0, 0x0BC0},
        {0x0BCD, 0x0BCD}, {0x0C3E, 0x0C40}, {0x0C46, 0x0C48},
        {0x0C4A, 0x0C4D}, {0x0C55, 0x0C56}, {0x0CBC, 0x0CBC},
        {0x0CBF, 0x0CBF}, {0x0CC6, 0x0CC6}, {0x0CCC, 0x0CCD},
        {0x0CE2, 0x0CE3}, {0x0D41, 0x0D43}, {0x0D4D, 0x0D4D},
        {0x0DCA, 0x0DCA}, {0x0DD2, 0x0DD4}, {0x0DD6, 0x0DD6},
        {0x0E31, 0x0E31}, {0x0E34, 0x0E3A}, {0x0E47, 0x0E4E},
        {0x0EB1, 0x0EB1}, {0x0EB4, 0x0EB9}, {0x0EBB, 0x0EBC},
        {0x0EC8, 0x0ECD}, {0x0F18, 0x0F19}, {0x0F35, 0x0F35},
        {0x0F37, 0x0F37}, {0x0F39, 0x0F39}, {0x0F71, 0x0F7E},
        {0x0F80, 0x0F84}, {0x0F86, 0x0F87}, {0x0F90, 0x0F97},
        {0x0F99, 0x0FBC}, {0x0FC6, 0x0FC6}, {0x102D, 0x1030},
        {0x1032, 0x1032}, {0x1036, 0x1037}, {0x1039, 0x1039},
        {0x1058, 0x1059}, {0x1160, 0x11FF}, {0x135F, 0x135F},
        {0x1712, 0x1714}, {0x1732, 0x1734}, {0x1752, 0x1753},
        {0x1772, 0x1773}, {0x17B4, 0x17B5}, {0x17B7, 0x17BD},
        {0x17C6, 0x17C6}, {0x17C9, 0x17D3}, {0x17DD, 0x17DD},
        {0x180B, 0x180D}, {0x18A9, 0x18A9}, {0x1920, 0x1922},
        {0x1927, 0x1928}, {0x1932, 0x1932}, {0x1939, 0x193B},
        {0x1A17, 0x1A18}, {0x1B00, 0x1B03}, {0x1B34, 0x1B34},
        {0x1B36, 0x1B3A}, {0x1B3C, 0x1B3C}, {0x1B42, 0x1B42},
        {0x1B6B, 0x1B73}, {0x1DC0, 0x1DCA}, {0x1DFE, 0x1DFF},
        {0x200B, 0x200F}, {0x202A, 0x202E}, {0x2060, 0x2063},
        {0x206A, 0x206F}, {0x20D0, 0x20EF}, {0x302A, 0x302F},
        {0x3099, 0x309A}, {0xA806, 0xA806}, {0xA80B, 0xA80B},
        {0xA825, 0xA826}, {0xFB1E, 0xFB1E}, {0xFE00, 0xFE0F},
        {0xFE20, 0xFE23}, {0xFEFF, 0xFEFF}, {0xFFF9, 0xFFFB},
        {0x10A01, 0x10A03}, {0x10A05, 0x10A06}, {0x10A0C, 0x10A0F},
        {0x10A38, 0x10A3A}, {0x10A3F, 0x10A3F}, {0x1D167, 0x1D169},
        {0x1D173, 0x1D182}, {0x1D185, 0x1D18B}, {0x1D1AA, 0x1D1AD},
        {0x1D242, 0x1D244}, {0xE0001, 0xE0001}, {0xE0020, 0xE007F},
        {0xE0100, 0xE01EF}
    };

    /**
     * Returns the defined column width of a Unicode character.
     *
     * <p>The column width of an ISO 10646 character is defined as
     * follows:</p>
     *
     * <ol>
     *   <li>The null character (U+0000) has a column width of 0.</li>
     *
     *   <li>Other C0/C1 control characters and DEL will lead to a return
     *       value of -1.</li>
     *
     *   <li>Non-spacing and enclosing combining characters (general
     *       category code Mn or Me in the Unicode database) have a
     *       column width of 0.</li>
     *
     *   <li>SOFT HYPHEN (U+00AD) has a column width of 1.</li>
     *
     *   <li>Other format characters (general category code Cf in the Unicode
     *       database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.</li>
     *
     *   <li>Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
     *       have a column width of 0.</li>
     *
     *   <li>Spacing characters in the East Asian Wide (W) or East Asian
     *       Full-width (F) category as defined in Unicode Technical
     *       Report #11 have a column width of 2.</li>
     *
     *   <li>All remaining characters (including all printable
     *       ISO 8859-1 and WGL4 characters, Unicode control characters,
     *       etc.) have a column width of 1.</li>
     * </ol>
     *
     * @param codePoint the character (Unicode code point) in question.
     * @return column width of the character: 0, 1 or 2, or -1 for an
     *         ISO control character other than U+0000.
     */
    public static int wcwidth(int codePoint) {
        /* U+0000 is checked first because isISOControl() would also match it */
        if (codePoint == 0) {
            return 0;
        }

        /* test for 8-bit (C0/C1 and DEL) control characters */
        if (Character.isISOControl(codePoint)) {
            return -1;
        }

        /* binary search in table of non-spacing characters */
        if (binarySearch(codePoint)) {
            return 0;
        }

        /* if we arrive here, codePoint is not a combining or C0/C1 control character */
        return isWide(codePoint) ? 2 : 1;
    }

    /**
     * Tells whether the code point falls in one of the hard-coded ranges
     * that this class treats as double-width.
     *
     * @param codePoint the character (Unicode code point) in question.
     * @return true if the character occupies two columns, otherwise false.
     */
    private static boolean isWide(int codePoint) {
        if (codePoint < 0x1100) {
            return false;
        }
        return codePoint <= 0x115f                                 /* Hangul Jamo init. consonants */
                || codePoint == 0x2329 || codePoint == 0x232a
                || (codePoint >= 0x2e80 && codePoint <= 0xa4cf
                        && codePoint != 0x303f)                    /* CJK ... Yi */
                || (codePoint >= 0xac00 && codePoint <= 0xd7a3)    /* Hangul Syllables */
                || (codePoint >= 0xf900 && codePoint <= 0xfaff)    /* CJK Compatibility Ideographs */
                || (codePoint >= 0xfe10 && codePoint <= 0xfe19)    /* Vertical forms */
                || (codePoint >= 0xfe30 && codePoint <= 0xfe6f)    /* CJK Compatibility Forms */
                || (codePoint >= 0xff00 && codePoint <= 0xff60)    /* Fullwidth Forms */
                || (codePoint >= 0xffe0 && codePoint <= 0xffe6)
                || (codePoint >= 0x20000 && codePoint <= 0x2fffd)
                || (codePoint >= 0x30000 && codePoint <= 0x3fffd);
    }

    /**
     * Searches whether the code point lies inside one of the intervals of
     * the non-spacing character table.
     *
     * @param codePoint the current character (Unicode code point) in question.
     * @return true if it is a non-spacing character, otherwise false.
     */
    private static boolean binarySearch(int codePoint) {
        int max = nonSpacingCharTable.length - 1;

        /* quick reject: outside the overall span covered by the table */
        if (codePoint < nonSpacingCharTable[0][0] || codePoint > nonSpacingCharTable[max][1]) {
            return false;
        }

        int min = 0;
        while (max >= min) {
            /* unsigned shift keeps the midpoint correct even if min + max overflowed */
            int mid = (min + max) >>> 1;
            if (codePoint > nonSpacingCharTable[mid][1]) {
                min = mid + 1;
            } else if (codePoint < nonSpacingCharTable[mid][0]) {
                max = mid - 1;
            } else {
                return true;
            }
        }

        return false;
    }

    /**
     * Controls how to count the column width.
     */
    public interface CountingAlgorithm {
        /**
         * Returns the calculated column count result.
         *
         * @param codePoint the current character (Unicode code point) in question.
         * @param count the calculated result before counting current character.
         * @param w default column width of current character, as returned by
         *          {@link WcwdithUtil#wcwidth(int)}.
         * @return the calculated result. Return -1 to end the whole counting process.
         */
        public int calculate(int codePoint, int count, int w);
    }

    /**
     * A counting algorithm that ends the whole counting and returns -1 when
     * an ISO control character is met.
     */
    public static final CountingAlgorithm ISO_CONTROL_TERMINATE = new CountingAlgorithm() {

        public int calculate(int codePoint, int count, int w) {
            if (w < 0) {
                return -1;
            }
            return count + w;
        }

    };

    /**
     * A counting algorithm that ignores ISO control characters, i.e. counts
     * them as zero columns wide.
     */
    public static final CountingAlgorithm ISO_CONTROL_IGNORE = new CountingAlgorithm() {

        public int calculate(int codePoint, int count, int w) {
            if (w < 0) {
                w = 0;
            }
            return count + w;
        }

    };

    /**
     * Returns the column width of a string.
     *
     * <p>The string is walked code point by code point, so a valid surrogate
     * pair is measured as one supplementary character. An unpaired
     * surrogate (including a high surrogate at the very end of the string)
     * is passed to {@link #wcwidth(int)} on its own instead of being
     * combined with whatever char happens to follow it.</p>
     *
     * @param s the string to count.
     * @param countingAlgorithm counting algorithm.
     * @return column width of the questioned string, or the first negative
     *         value returned by the counting algorithm.
     * @throws NullPointerException if {@code s} is null, or if
     *         {@code countingAlgorithm} is null and {@code s} is not empty.
     */
    public static int wcswidth(String s, CountingAlgorithm countingAlgorithm) {
        final int length = s.length();
        int count = 0;
        int i = 0;
        while (i < length) {
            /* codePointAt() never reads past the end of the string */
            int codePoint = s.codePointAt(i);
            i += Character.charCount(codePoint);

            int w = wcwidth(codePoint);
            count = countingAlgorithm.calculate(codePoint, count, w);
            if (count < 0) {
                /* the algorithm asked to terminate the whole counting */
                return count;
            }
        }

        return count;
    }

}