在java代码中对字符进行繁简体转换和全半角转换

2010年12月12日

在java代码中对字符进行繁简体转换和全半角转换

由于java代码本身使用unicode代码表示,因此在java代码内部不需要考虑GB2312,GBK,GB18030,BIG5等字符编码集,
只要定义Unicode字符对应关系,即可实现繁体到简体,简体到繁体,全角到半角,半角到全角的字符转换。
但需要注意的是:
1.繁简体转换时,有些字符不是简单的一一对应关系,因此不能简单的靠字符对应关系进行简繁体转换。
2.从java5.0开始的jdk版本支持Unicode标准为Unicode 4.0.0,在JDK7.0将支持Unicode 6.0.0,而部分简繁体字符已经是Unicode里的增补字符,因此需要支持以代码点方式进行转换。
根据维基百科内容整理后封装的Chars类代码如下:

[code]
package com.lizongbo.common.primitives;

import gnu.trove.map.TIntIntMap;
import gnu.trove.map.hash.TIntIntHashMap;
/**
 * Character utilities: Traditional/Simplified Chinese conversion (one-to-one
 * mappings only), full-width/half-width conversion, and a cache of boxed
 * {@link Character} objects.
 *
 * @author lizongbo
 */
public class Chars {
    /**
     * Distance between a full-width ASCII variant (U+FF01..U+FF5E) and its
     * ASCII counterpart (U+0021..U+007E); equals 65248.
     */
    private static final int FULL_WIDTH_OFFSET = 0xFEE0;

    /** First and last full-width ASCII variants. */
    private static final int FULL_WIDTH_FIRST = 0xFF01;
    private static final int FULL_WIDTH_LAST = 0xFF5E;

    /** First and last printable, non-space ASCII characters. */
    private static final int HALF_WIDTH_FIRST = 0x21;
    private static final int HALF_WIDTH_LAST = 0x7E;

    /** U+3000 IDEOGRAPHIC SPACE and U+3002 IDEOGRAPHIC FULL STOP. */
    private static final int IDEOGRAPHIC_SPACE = 0x3000;
    private static final int IDEOGRAPHIC_FULL_STOP = 0x3002;

    /**
     * Cache of boxed characters, one slot per possible {@code char} value.
     * The size is MAX_VALUE + 1 so that U+FFFF itself has a slot.
     */
    private static final Character CharacterCache[] = new Character[Character.MAX_VALUE + 1];

    /**
     * Traditional-to-Simplified mapping table (code point to code point).
     *
     * @see http://zh.wikipedia.org/zh-cn/Wikipedia:Unihan%E7%B9%81%E7%AE%80%E4%BD%93%E5%AF%B9%E7%85%A7%E8%A1%A8/%E7%B9%81%E7%AE%80%E4%B8%80%E4%B8%80%E5%AF%B9%E5%BA%94%E8%A1%A8
     */
    private static final TIntIntMap cht2chsMap = new TIntIntHashMap(3096);

    /**
     * Simplified-to-Traditional mapping table (code point to code point).
     *
     * @see http://zh.wikipedia.org/zh-cn/Wikipedia:Unihan%E7%B9%81%E7%AE%80%E4%BD%93%E5%AF%B9%E7%85%A7%E8%A1%A8/%E7%AE%80%E7%B9%81%E4%B8%80%E4%B8%80%E5%AF%B9%E5%BA%94%E8%A1%A8
     */
    private static final TIntIntMap chs2chtMap = new TIntIntHashMap(3096);

    /**
     * Returns the Traditional-to-Simplified pairs as
     * {@code { traditionalCodePoint, simplifiedCodePoint }}.
     */
    private static final int[][] getCht2chsArr() {
        int[][] cht2chsArr = new int[][] { { 0x042b7, 0x04336 },
                { 0x042d9, 0x0433a }, { 0x0477c, 0x0478d },
                { 0x04c3e, 0x09c83 }, { 0x04c81, 0x09cda },
                { 0x04e1f, 0x04e22 }, { 0x04e26, 0x05e76 },
                { 0x04e7e, 0x05e72 }, { 0x04e82, 0x04e71 },
                // ... entries omitted here; see the complete Java source file
                { 0x27a59, 0x04725 }, { 0x27d73, 0x0478c },
                { 0x282e2, 0x04882 } };
        return cht2chsArr;
    }

    /**
     * Returns the Simplified-to-Traditional pairs as
     * {@code { simplifiedCodePoint, traditionalCodePoint }}.
     */
    private static final int[][] getChs2chtArr() {
        return new int[][] { { 0x0359e, 0x0558e }, { 0x039d1, 0x0649d },
                { 0x039df, 0x064d3 }, { 0x03c6e, 0x06ba8 },
                { 0x04056, 0x0779c }, { 0x041f2, 0x07b74 },
                { 0x04336, 0x042b7 }, { 0x04337, 0x07d2c },
                { 0x04338, 0x07e33 }, { 0x04339, 0x07d45 },
                { 0x0433a, 0x042d9 }, { 0x04341, 0x07e78 },
                { 0x04723, 0x08a22 }, { 0x04725, 0x27a59 },
                { 0x0478c, 0x27d73 }, { 0x0478d, 0x0477c },
                // ... entries omitted here; see the complete Java source file
                { 0x21484, 0x058c8 } };
    }

    /**
     * Returns the cached {@link Character} for the given {@code char}.
     *
     * @param c any char value, including U+FFFF
     * @return the cached boxed character
     */
    public static Character getCharacter(char c) {
        return CharacterCache[c];
    }

    static {
        initAll();
    }

    /** Fills the character cache and both conversion maps. */
    private static void initAll() {
        initCharCache();
        initChs2ChtMap();
        initCht2ChsMap();
    }

    /** Populates one cache slot for every possible char value. */
    private static void initCharCache() {
        for (int i = 0; i < CharacterCache.length; i++) {
            CharacterCache[i] = Character.valueOf((char) i);
        }
    }

    /** Loads the Simplified-to-Traditional pairs into their map. */
    private static void initChs2ChtMap() {
        int[][] pairs = getChs2chtArr();
        for (int i = 0; i < pairs.length; i++) {
            Chars.chs2chtMap.put(pairs[i][0], pairs[i][1]);
        }
    }

    /** Loads the Traditional-to-Simplified pairs into their map. */
    private static void initCht2ChsMap() {
        int[][] pairs = getCht2chsArr();
        for (int i = 0; i < pairs.length; i++) {
            Chars.cht2chsMap.put(pairs[i][0], pairs[i][1]);
        }
    }

    /**
     * Narrows a converted code point back to a char. A supplementary code
     * point (above U+FFFF) cannot be represented in one char, so the original
     * char is returned instead of a truncated, unrelated character.
     */
    private static char toCharOrOriginal(int converted, char original) {
        return converted <= Character.MAX_VALUE ? (char) converted : original;
    }

    /**
     * Converts a Traditional Chinese char to Simplified. Only one-to-one
     * mappings are handled; one-to-many relations are ignored.
     *
     * @param c the char to convert
     * @return the Simplified char, or {@code c} when there is no mapping or
     *         the mapping target lies outside the BMP (use
     *         {@link #cht2chs(int)} for those)
     */
    public static char cht2chs(char c) {
        return toCharOrOriginal(cht2chs((int) c), c);
    }

    /**
     * Converts a Traditional Chinese code point to Simplified, e.g. U+282E2
     * becomes U+4882. Only one-to-one mappings are handled.
     *
     * @param codePoint the code point to convert
     * @return the mapped code point, or {@code codePoint} when unmapped
     */
    public static int cht2chs(int codePoint) {
        // A non-positive result is treated as "no mapping"; this presumably
        // relies on Trove's default no-entry value of 0 - TODO confirm.
        int r = cht2chsMap.get(codePoint);
        return r > 0 ? r : codePoint;
    }

    /**
     * Converts a Simplified Chinese char to Traditional. Only one-to-one
     * mappings are handled; one-to-many relations are ignored.
     *
     * @param c the char to convert
     * @return the Traditional char, or {@code c} when there is no mapping or
     *         the mapping target lies outside the BMP (use
     *         {@link #chs2cht(int)} for those)
     */
    public static char chs2cht(char c) {
        return toCharOrOriginal(chs2cht((int) c), c);
    }

    /**
     * Converts a Simplified Chinese code point to Traditional, e.g. U+21484
     * becomes U+58C8. Only one-to-one mappings are handled.
     *
     * @param codePoint the code point to convert
     * @return the mapped code point, or {@code codePoint} when unmapped
     */
    public static int chs2cht(int codePoint) {
        // See cht2chs(int) for the meaning of a non-positive lookup result.
        int r = chs2chtMap.get(codePoint);
        return r > 0 ? r : codePoint;
    }

    /**
     * Converts a full-width char to half-width. Japanese and Korean
     * full-width forms are not handled.
     *
     * @see http://zh.wikipedia.org/zh-cn/%E5%85%A8%E8%A7%92
     *
     * @param c the char to convert
     * @return the half-width char, or {@code c} when it is not full-width
     */
    public static char quan2ban(char c) {
        // The explicit (int) cast selects the code point overload; without it
        // overload resolution picks this very method and recurses forever.
        return (char) quan2ban((int) c);
    }

    /**
     * Converts a full-width code point to half-width. Japanese and Korean
     * full-width forms are not handled.
     *
     * @param codePoint the code point to convert
     * @return the half-width code point, or {@code codePoint} when unchanged
     */
    public static int quan2ban(int codePoint) {
        // Only U+FF01..U+FF5E are full-width ASCII variants; U+FF5F..U+FF65
        // have no ASCII counterpart and must be left alone.
        if (codePoint >= FULL_WIDTH_FIRST && codePoint <= FULL_WIDTH_LAST) {
            return codePoint - FULL_WIDTH_OFFSET;
        }
        if (codePoint == IDEOGRAPHIC_SPACE) {
            return ' ';
        }
        if (codePoint == IDEOGRAPHIC_FULL_STOP) {
            return '.';
        }
        return codePoint;
    }

    /**
     * Converts a half-width char to full-width. Japanese and Korean
     * half-width forms are not handled.
     *
     * @param c the char to convert
     * @return the full-width char, or {@code c} when it is not half-width
     */
    public static char ban2quan(char c) {
        // The explicit (int) cast selects the code point overload; without it
        // overload resolution picks this very method and recurses forever.
        return (char) ban2quan((int) c);
    }

    /**
     * Converts a half-width code point to full-width. Japanese and Korean
     * half-width forms are not handled.
     *
     * @param codePoint the code point to convert
     * @return the full-width code point, or {@code codePoint} when unchanged
     */
    public static int ban2quan(int codePoint) {
        // Special cases come first so that they keep their existing results:
        // space -> U+3000 and '.' -> U+3002 (the inverse of quan2ban).
        if (codePoint == ' ') {
            return IDEOGRAPHIC_SPACE;
        }
        if (codePoint == '.') {
            return IDEOGRAPHIC_FULL_STOP;
        }
        if (codePoint >= HALF_WIDTH_FIRST && codePoint <= HALF_WIDTH_LAST) {
            return codePoint + FULL_WIDTH_OFFSET;
        }
        return codePoint;
    }

    /**
     * Prints every configured mapping and the full/half-width tables.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        int[][] chs2chtArr = getChs2chtArr();
        for (int i = 0; i < chs2chtArr.length; i++) {
            int[] cm = chs2chtArr[i];
            int codePointCht = Chars.chs2cht(cm[0]);
            System.out.println(":" + new String(new int[] { cm[0] }, 0, 1)
                    + ",:"
                    + new String(new int[] { codePointCht }, 0, 1)
                    + ",codePointCht==" + Integer.toHexString(codePointCht)
                    + ",codePointChs==" + Integer.toHexString(cm[0]));
        }
        int[][] cht2chsArr = getCht2chsArr();
        for (int i = 0; i < cht2chsArr.length; i++) {
            int[] cm = cht2chsArr[i];
            int codePointChs = Chars.cht2chs(cm[0]);
            System.out.println("繁体:" + new String(new int[] { cm[0] }, 0, 1)
                    + ",简体:" + new String(new int[] { codePointChs }, 0, 1)
                    + ",codePointChs==" + Integer.toHexString(codePointChs)
                    + ",codePointCht==" + Integer.toHexString(cm[0]));
        }

        for (char c = 0xFF01; c <= 0xff65; c++) {
            System.out.println("全角字符:" + c + ",半角字符:" + Chars.quan2ban(c));
        }
        for (char c = 0x20; c <= 0x7e; c++) {
            System.out.println("半角字符:" + c + ",全角字符:" + Chars.ban2quan(c));
        }
    }
}

[/code]

相关参考信息:
Java 平台中的增补字符:

http://java.sun.com/developer/technicalArticles/Intl/Supplementary/index_zh_CN.html

维基百科:Unihan繁简体对照表:
http://zh.wikipedia.org/zh-cn/Wikipedia:Unihan%E7%B9%81%E7%AE%80%E4%BD%93%E5%AF%B9%E7%85%A7%E8%A1%A8
Wikipedia:Unihan繁简体对照表/简繁一一对应表:
http://zh.wikipedia.org/zh-cn/Wikipedia:Unihan%E7%B9%81%E7%AE%80%E4%BD%93%E5%AF%B9%E7%85%A7%E8%A1%A8/%E7%AE%80%E7%B9%81%E4%B8%80%E4%B8%80%E5%AF%B9%E5%BA%94%E8%A1%A8
Wikipedia:Unihan繁简体对照表/繁简一一对应表:
http://zh.wikipedia.org/zh-cn/Wikipedia:Unihan%E7%B9%81%E7%AE%80%E4%BD%93%E5%AF%B9%E7%85%A7%E8%A1%A8/%E7%B9%81%E7%AE%80%E4%B8%80%E4%B8%80%E5%AF%B9%E5%BA%94%E8%A1%A8
全角和半角:

http://zh.wikipedia.org/zh-cn/%E5%85%A8%E8%A7%92

JDK7.0特性列表:

http://openjdk.java.net/projects/jdk7/features/#f497

Unicode6.0.0:

http://unicode.org/versions/Unicode6.0.0/

Unicode 半角与全角的字符表:

http://unicode.org/charts/PDF/UFF00.pdf

完整的Chars.java的源代码:

http://mqq.im/docs/java/com/lizongbo/common/primitives/Chars.java

Tags: , , , , , ,

使用java.net.URL解析校验检查url非法字符时撞上了bug

2010年12月7日

java.net.URL对url格式的检查不严格,如果使用java.net.URL来进行url解析并判断url是否为指定域名时将产生漏洞。

目前一共发现两种情况会解析错误:
1.java.net.URL对url里存在回车符和换行符被认为是合法的:
<%
String goUrl="http://618119.com/\r\nX-Location: http://www.lizongbo.com/";
//goUrl=java.net.URLEncoder.encode(goUrl, "UTF-8");
response.sendRedirect(goUrl);
%>
例如上面的代码即使使用java.net.URL进行解析,也能正常解析,而被认为是个合法的url。
加上response.setHeader的时候没做参数检查,导致写入了非法的header,这样会导致CRLF注入(以及由此引发的XSS)攻击。

2.”http://618119.com#www.lizongbo.com/”
这样的url被java.net.URL解析得到的host是618119.com#www.lizongbo.com,因此按域名后缀判断的话会被误放过,
在浏览器地址栏里实际请求会变成:http://618119.com/#www.lizongbo.com/

这样也会产生非法跳转漏洞。

使用java.net.URI进行解析则不会出现这样的问题。

JDK里java.net.URL的文档所引用的文章链接为: http://www.socs.uts.edu.au/MosaicDocs-old/url-primer.html,但是这个链接已经失效了。

因此封装下面这个工具类来对url进行检查,避免URL 参数里出现非法字符导致的非法跳转的漏洞:

[code]

package com.lizongbo.net;

import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
/**
 * Shows how {@code java.net.URL} and {@code java.net.URI} differ when parsing
 * suspicious URLs, and provides an allow-list check for redirect targets.
 *
 * @author lizongbo
 */
public class UrlTest {

    /**
     * Domain suffixes accepted by {@link #verifyURL(String)}. Each entry
     * starts with a dot so that "evil618119.com" does not match ".618119.com".
     */
    private static final String[] ALLOWED_DOMAIN_SUFFIXES = {
            ".618119.com", ".lizongbo.com", ".mqq.im", ".seotijian.com" };

    /**
     * Runs both parsers over a set of sample URLs and prints what each sees.
     *
     * @param args unused
     * @throws MalformedURLException declared for compatibility; not thrown here
     * @throws UnsupportedEncodingException declared for compatibility; not thrown here
     * @throws URISyntaxException declared for compatibility; not thrown here
     */
    public static void main(String[] args) throws MalformedURLException,
            UnsupportedEncodingException, URISyntaxException {
        String urlStr = "http://618119.com/\r\nX-Location: http://www.lizongbo.com/";
        checkUrl(urlStr);
        urlStr = "http://618119.com#www.lizongbo.com/";
        checkUrl(urlStr);
        urlStr = "https://www.google.com/reader/view/#stream/feed%2Fhttp%3A%2F%2Fwww.lizongbo.com%2Ffeeds%2Fposts%2Fdefault";
        checkUrl(urlStr);
        checkUrl("file:C:/autoexec.bat");
        checkUrl("file:/C:/autoexec.bat");
        checkUrl("file://C:/autoexec.bat");
        checkUrl("file:///C:/autoexec.bat");
        checkUrl("/aa.jsp");
    }

    /**
     * Parses the string first as a URI and then as a URL, dumping each result.
     * A failure of one parser is printed and does not prevent the other from
     * running.
     */
    private static void checkUrl(String urlStr) {
        try {
            java.net.URI uri = new URI(urlStr);
            dump(uri);
        } catch (Exception e) {
            // Also catches the unchecked exception uri.toURL() raises in dump.
            e.printStackTrace();
        }
        try {
            java.net.URL url = new URL(urlStr);
            dump(url);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Prints the components {@code java.net.URL} extracted from the input. */
    private static void dump(java.net.URL url) {
        try {
            System.out.println("url=" + url + ",protocol=" + url.getProtocol()
                    + ",host=" + url.getHost() + ",path=" + url.getPath()
                    + ",query=" + url.getQuery() + ",ref=" + url.getRef()
                    + ",url.toURI=" + url.toURI());
        } catch (URISyntaxException e) {
            e.printStackTrace();
        }
    }

    /** Prints the components {@code java.net.URI} extracted from the input. */
    private static void dump(java.net.URI uri) {
        try {
            System.out.println("uri=" + uri + ",scheme=" + uri.getScheme()
                    + ",host=" + uri.getHost() + ",path=" + uri.getPath()
                    + ",query=" + uri.getQuery() + ",fragment="
                    + uri.getFragment() + ",uri.toURL=" + uri.toURL());
        } catch (MalformedURLException e) {
            e.printStackTrace();
        }
    }

    /**
     * Tells whether {@code host} belongs to the domain described by
     * {@code domainSuffix} (for example ".example.com"): either a sub-domain
     * of it or the bare domain itself.
     */
    private static boolean matchesDomain(String host, String domainSuffix) {
        if (host.endsWith(domainSuffix)) {
            return true;
        }
        // The leading dot guards against look-alike hosts, but it must not
        // exclude the apex domain ("example.com") itself.
        return domainSuffix.startsWith(".")
                && host.equals(domainSuffix.substring(1));
    }

    /**
     * Checks strictly, using {@code java.net.URI}, whether the given URL is a
     * legitimate in-site redirect target.
     *
     * @param goUrl the candidate target; may be {@code null}
     * @return {@code true} only for a syntactically valid http/https URL whose
     *         host is one of the allowed domains or a sub-domain of one
     */
    public static boolean verifyURL(String goUrl) {
        if (goUrl == null) {
            return false;
        }
        java.net.URI target;
        try {
            // URI rejects input such as embedded CR/LF that URL lets through.
            target = new java.net.URI(goUrl);
        } catch (URISyntaxException e) {
            return false;
        }
        String scheme = target.getScheme();
        if (!"http".equalsIgnoreCase(scheme)
                && !"https".equalsIgnoreCase(scheme)) {
            return false;
        }
        String host = target.getHost();
        if (host == null) {
            return false;
        }
        // Locale.ROOT keeps the comparison stable: under a Turkish default
        // locale "I" would otherwise lower-case to a dotless i.
        host = host.toLowerCase(java.util.Locale.ROOT);
        for (int i = 0; i < ALLOWED_DOMAIN_SUFFIXES.length; i++) {
            if (matchesDomain(host, ALLOWED_DOMAIN_SUFFIXES[i])) {
                return true;
            }
        }
        return false;
    }

    /**
     * Legacy check based on {@code java.net.URL}, kept to demonstrate the
     * flaw; its behaviour is intentionally left unchanged.<br/>
     * "http://618119.com/\r\nX-Location: http://www.lizongbo.com/"<br/>
     * "http://www.lizongbo.com#618119.com/"<br/>
     * are the two inputs reported to slip past this check.
     *
     * @param goUrl the candidate target; may be {@code null}
     * @return {@code true} when the URL parser accepts the input as an
     *         http/https URL whose host ends with an allowed domain suffix
     * @deprecated use {@link #verifyURL(String)}, which parses with
     *             {@code java.net.URI}
     */
    @Deprecated
    public static boolean verifyURLOld(String goUrl) {
        if (goUrl == null) {
            return false;
        }
        java.net.URL cgoUrl = null;
        try {
            // Lenient parser: this is the source of the weakness.
            cgoUrl = new java.net.URL(goUrl);
        } catch (Exception e) {
            return false;
        }
        if (!"http".equalsIgnoreCase(cgoUrl.getProtocol())
                && !"https".equalsIgnoreCase(cgoUrl.getProtocol())) {
            return false;
        }
        String hostString = cgoUrl.getHost().toLowerCase();
        String allowDomains = ".618119.com;.lizongbo.com;.mqq.im";
        if (allowDomains.length() > 0) {
            String[] domains = allowDomains.split(";");
            for (int i = 0; i < domains.length; i++) {
                String dmTmp = domains[i];
                if (dmTmp != null && dmTmp.length() > 0
                        && hostString.endsWith(dmTmp)) {
                    return true;
                }
            }
        }
        return false;
    }
}

[/code]

Tags: , , ,

使用CRLFFilter过滤HTTP应答头信息名称和值的非法字符防止CRLF注入攻击

2010年11月29日

使用CRLFFilter过滤http应答中头信息名称和值的非法字符,防止CRLF注入攻击
经过测试Resin的response.addHeader方法也没做header名字和值的检查,因此如果webapp代码写法不当的话,将导致CRLF注入攻击,

例如一个页面从url参数中获取地址然后进行跳转,如果url地址存在“%0d%0a”编码表示的CRLF而未被检测过滤(java.net.URL解析不会出错,必须用java.net.URI才行),将产生安全漏洞。

通过下面的代码可以重现这个Xss漏洞攻击。
jsp代码:
<%
response.addHeader("X-Locationaaa: http://mqq.im/\r\nX-tesh","aaa");
response.addHeader("X-Locationbbb: 汉字/\r\nX-teshbbb","aaa");

//下面的goUrl可以从URL的参数中获取,如果url地址存在“%0d%0a”编码表示的CRLF而未被检测过滤(java.net.URL解析不会出错,必须用java.net.URI才行),将产生漏洞。

String goUrl="http://lizongbo.com/\r\nX-Location: http://618119.com/";
//goUrl=java.net.URLEncoder.encode(goUrl, "UTF-8");
response.sendError(403,goUrl);
%>

在Firefox中访问jsp,使用Live HTTP headers 可以看到生成的实际head如下:

HTTP/1.1 403 http://lizongbo.com/
X-Location: http://618119.com/
Server: Resin/4.0.10
X-Locationaaa: http://mqq.im/
X-tesh: aaa
X-Locationbbb: 汉字
X-teshbbb: aaa
Content-Type: text/html; charset=utf-8
Content-Length: 216
Date: Tue, 09 Nov 2010 02:37:48 GMT

因此封装过滤器代码如下:
[code]
package com.lizongbo.web.filter;

import java.io.IOException;

import javax.servlet.Filter;
import javax.servlet.FilterChain;
import javax.servlet.FilterConfig;
import javax.servlet.ServletException;
import javax.servlet.ServletRequest;
import javax.servlet.ServletResponse;
import javax.servlet.http.HttpServletResponse;

/**
 * Servlet filter that hands the rest of the chain a
 * {@code CRLFFilterResponseWrapper} in place of the HTTP response, so that
 * header names and values written downstream are sanitized.
 */
public class CRLFFilter implements Filter {

    /** Nothing to release. */
    @Override
    public void destroy() {

    }

    /**
     * Wraps HTTP responses before continuing the chain. A response that is
     * not an {@link HttpServletResponse} is passed through untouched instead
     * of failing with a {@code ClassCastException}.
     *
     * @param req the incoming request, forwarded unchanged
     * @param res the response to wrap when it is an HTTP response
     * @param chain the remaining filter chain
     * @throws IOException propagated from the chain or the wrapper constructor
     * @throws ServletException propagated from the chain
     */
    @Override
    public void doFilter(ServletRequest req, ServletResponse res,
            FilterChain chain) throws IOException, ServletException {
        if (res instanceof HttpServletResponse) {
            HttpServletResponse wrapped = new CRLFFilterResponseWrapper(
                    (HttpServletResponse) res);
            chain.doFilter(req, wrapped);
        } else {
            chain.doFilter(req, res);
        }
    }

    /** No configuration is read. */
    @Override
    public void init(FilterConfig config) throws ServletException {

    }

}

[/code]

[code]
package com.lizongbo.web.filter;

import java.io.IOException;
import java.util.Arrays;

import javax.servlet.http.HttpServletResponse;
import javax.servlet.http.HttpServletResponseWrapper;

/**
 * Response wrapper that strips illegal characters from header names and
 * values before delegating, to prevent CRLF injection through response
 * headers, status messages, redirects and the content type.
 */
public class CRLFFilterResponseWrapper extends HttpServletResponseWrapper {
    /**
     * Characters (besides controls, space and non-ASCII) that are not allowed
     * in an HTTP header name. Sorted once so binarySearch can be used.
     */
    private static final char[] headerName_tspecials = new char[] { '(', ')',
            '<', '>', '@', ',', ';', ':', '\\', '\"', '/', '[', ']', '?', '=',
            '{', '}' };
    static {
        Arrays.sort(headerName_tspecials);
    }

    /** The wrapped response, as passed to the constructor. */
    HttpServletResponse response = null;

    /**
     * @param response the response to wrap
     * @throws IOException declared for compatibility with existing callers
     */
    public CRLFFilterResponseWrapper(HttpServletResponse response)
            throws IOException {
        super(response);
        this.response = response;
    }

    @Override
    public void addHeader(String name, String value) {
        super.addHeader(filterHeaderName(name), filterHeaderValue(value));
    }

    @Override
    public void sendError(int sc, String msg) throws IOException {
        super.sendError(sc, filterHeaderValue(msg));
    }

    @Override
    public void sendRedirect(String location) throws IOException {
        super.sendRedirect(filterHeaderValue(location));
    }

    @Override
    public void setHeader(String name, String value) {
        super.setHeader(filterHeaderName(name), filterHeaderValue(value));
    }

    @Override
    @SuppressWarnings("deprecation")
    public void setStatus(int sc, String sm) {
        super.setStatus(sc, filterHeaderValue(sm));
    }

    @Override
    public void addDateHeader(String name, long date) {
        super.addDateHeader(filterHeaderName(name), date);
    }

    @Override
    public void addIntHeader(String name, int value) {
        super.addIntHeader(filterHeaderName(name), value);
    }

    @Override
    public void setDateHeader(String name, long date) {
        super.setDateHeader(filterHeaderName(name), date);
    }

    @Override
    public void setIntHeader(String name, int value) {
        super.setIntHeader(filterHeaderName(name), value);
    }

    @Override
    public void setContentType(String contentType) {
        super.setContentType(filterHeaderValue(contentType));
    }

    /**
     * Removes every character that may not appear in a header name, following
     * the RFC 2068 {@code token} rule: a token is one or more characters
     * other than control characters (0-31 and 127) and the tspecials
     * {@code ( ) < > @ , ; : \ " / [ ] ? = { }}, space and horizontal tab.
     * Characters above 126 are removed as well.
     *
     * @param name the raw header name
     * @return the filtered name; the literal string "null" when {@code name}
     *         is {@code null} or empty
     */
    private static String filterHeaderName(String name) {
        if (name == null || name.length() < 1) {
            return "null";
        }
        StringBuilder kept = new StringBuilder(name.length());
        for (int i = 0; i < name.length(); i++) {
            char c = name.charAt(i);
            boolean printableNonSpace = c > 32 && c < 127;
            if (printableNonSpace
                    && Arrays.binarySearch(headerName_tspecials, c) < 0) {
                kept.append(c);
            }
        }
        return kept.toString();
    }

    /**
     * Removes control characters (including CR and LF) and anything outside
     * printable ASCII from a header value.
     *
     * @param value the raw header value; may be {@code null}
     * @return the filtered value; {@code null} and the empty string are
     *         returned unchanged so the container sees what the caller passed
     *         instead of the literal text "null"
     */
    private static String filterHeaderValue(String value) {
        if (value == null || value.length() < 1) {
            return value;
        }
        StringBuilder kept = new StringBuilder(value.length());
        for (int i = 0; i < value.length(); i++) {
            char c = value.charAt(i);
            if (c >= 32 && c < 127) {
                kept.append(c);
            }
        }
        return kept.toString();
    }

    /**
     * Prints a sample header name and value before and after filtering.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String headName = "aaaa aaa\r\n bbb ";
        String headvalue = "cccccccccc\r\n ddd";
        System.out.println(headName + "==" + filterHeaderName(headName));
        System.out.println(headvalue + "==" + filterHeaderValue(headvalue));
    }
}
[/code]

参考链接:

http://www.ietf.org/rfc/rfc2068.txt

http://www.acunetix.com/websitesecurity/crlf-injection.htm

http://comic.sjtu.edu.cn/bbs/view.asp?TID=4118

Tags: , , , ,