文章关键字 ‘Java’

使用epub-tools抓取网页生成epub电子书,用epubcheck进行校验

2010年03月4日,星期四

最近在看epub格式的电子书,于是对epub格式的电子书有一点了解。

然后下载了epub-tools和epubcheck的代码,写了个小程序,通过分析网上小说主页和章节内容,然后生成epub格式的电子书。

epub-tools 来源:http://code.google.com/p/epub-tools/

epubcheck来源:http://code.google.com/p/epubcheck/

部分代码如下:

————————————————–

package com.lizongbo.epub;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;

import com.adobe.dp.epub.io.DataSource;

/**
 * Data source backed by an image hosted on the web.
 *
 * Nothing is downloaded when the object is created; each call to
 * {@link #getInputStream()} opens a new stream to the configured address.
 */
public class ImgFileUrlDataSource extends DataSource {

    /** Address of the remote image, exactly as handed to the constructor. */
    String url;

    /**
     * @param url address of the image to read from
     */
    public ImgFileUrlDataSource(String url) {
        this.url = url;
    }

    /**
     * Opens a stream onto the remote image.
     *
     * @return a freshly opened stream; the caller is responsible for closing it
     * @throws IOException if the address is malformed or the stream cannot be opened
     */
    public InputStream getInputStream() throws IOException {
        URL imageLocation = new URL(this.url);
        return imageLocation.openStream();
    }

}

————————————————–

package com.lizongbo.epub;

import java.awt.image.BufferedImage;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;

import com.adobe.dp.epub.io.BufferedDataSource;
import com.adobe.dp.epub.io.OCFContainerWriter;
import com.adobe.dp.epub.io.StringDataSource;
import com.adobe.dp.epub.ncx.TOCEntry;
import com.adobe.dp.epub.opf.BitmapImageResource;
import com.adobe.dp.epub.opf.NCXResource;
import com.adobe.dp.epub.opf.OPSResource;
import com.adobe.dp.epub.opf.Publication;
import com.adobe.dp.epub.opf.Resource;
import com.adobe.dp.epub.ops.Element;
import com.adobe.dp.epub.ops.OPSDocument;
import com.adobe.epubcheck.api.EpubCheck;
import com.adobe.epubcheck.api.Report;
import com.adobe.epubcheck.util.DefaultReportImpl;
import javax.imageio.ImageIO;

/**
* 从网上抓取网页下来,打包成epub
* 书目录url:
* 参考 http://code.google.com/p/epub-tools/wiki/HelloEPUB2
* http://code.google.com/p/epub-tools/w/list
*
* @author
*
*/
public class HtmlBook2epub {

/**
*
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
String bookId = “618119″;//
makeEpub(bookId, “/home/lizongbo/tmp”);

}

/**
* 根据小说id生成对应的epub文件
*
* @param HtmlBookId
* @return
* @throws Exception
*/
public static boolean makeEpub(String HtmlBookId, String epubDir)
throws Exception {
String bookCatalogUrl = “http://book.com/?bookid=”+ HtmlBookId;
String bookCatalogHtml = downloadUrlContent(bookCatalogUrl);
String bookTitle = HtmlBook2epub.getBookTitle(bookCatalogHtml);
String bookAuthor = HtmlBook2epub.getBookAuthor(bookCatalogHtml);
String dir = “OEBPS”;
Publication epub = new Publication(dir);
// see http://www.idpf.org/2007/opf/OPF_2.0_final_spec.html#Section2.1
// <title>: 题名 <creator> :责任者 <subject> :主题词或关键词 <description> :内容描述
// <contributor> :贡献者或其它次要责任者 <date> :日期 <type> :类型 <format> :格式
// <identifier> :标识符 <source> :来源 <language> :语种 <relation> :相关信息
// <coverage> :履盖范围 <rights> :权限描述
epub.addDCMetadata(“title”, bookTitle);//添加标题
epub.addDCMetadata(“creator”, bookAuthor);//添加书作者
addIntro(epub, HtmlBookId);// 添加简介
epub.addDCMetadata(“publisher”, “lizongbo”);
epub.addDCMetadata(“contributor”, “lizongbo”);
///epub.addDCMetadata(“date”, “”);
///epub.addDCMetadata(“type”, “”);
///epub.addDCMetadata(“format”, “lizongbo”);
epub.addDCMetadata(“identifier”, “Htmlbook_” + HtmlBookId);
epub.addDCMetadata(“source”, bookCatalogUrl);
epub.addDCMetadata(“language”, “zh”);
///epub.addDCMetadata(“ralation”, “”);
///epub.addDCMetadata(“coverage”, “无”);
epub.addDCMetadata(“rights”, “本书由lizongbo整理网页生成”);
epub.addMetadata(null, “cover”, “cover-image”);//添加蜂蜜图片的id
String[] chapterIds = getChapterIds(bookCatalogHtml, HtmlBookId);
addCoverImg(epub, HtmlBookId);// 添加封面和缩略图
for (int i = 0; i < 3000 && i < chapterIds.length; i++) {
addChapter(epub, HtmlBookId, chapterIds[i]);// 添加章节
}
File outFile = new File(epubDir, “Htmlbook_” + HtmlBookId + “.epub”);
OutputStream out = new FileOutputStream(outFile);
OCFContainerWriter container = new OCFContainerWriter(out);
epub.serialize(container);
checkEpub(outFile.getAbsolutePath());
return false;

}

/**
* 检查 epub书格式是否ok
*
* @param epubName
*/
public static void checkEpub(String epubName) {
Report report = new DefaultReportImpl(epubName);
if (!epubName.endsWith(“.epub”))
report.warning(null, 0, “filename does not include ‘.epub’ suffix”);

EpubCheck check = new EpubCheck(new File(epubName), report);
if (check.validate())
System.out.println(“No errors or warnings detected”);
else {
System.err.println(“\nCheck finished with warnings or errors!\n”);
}
}

/**
* 添加封面图片和缩略图
*
* @param epub
* @param HtmlBookId
* @throws Exception
*/
public static void addCoverImg(Publication epub, String HtmlBookId)
throws Exception {
String bookUrl = “http://book.com/index_” + HtmlBookId + “.htm”;
String bookHtml = downloadUrlContent(bookUrl);
String coverImgUrl = getCoverImgUrl(bookHtml);
BitmapImageResource coverImg = epub.createBitmapImageResource(epub
.getContentFolder()
+ “/images/cover.jpg”, “image/jpeg”, new ImgFileUrlDataSource(
coverImgUrl));
coverImg.setId(“cover-image”);
// 还需要把封面图片转成缩略图 thumb.png
BufferedDataSource thumbDs = new BufferedDataSource();
BufferedImage bi = ImageIO.read(new URL(coverImgUrl));// 读到原图
BufferedImage tag = null;
tag = new BufferedImage(54, 75, BufferedImage.TYPE_INT_RGB);
tag.getGraphics().drawImage(bi, 0, 0, 54, 75, null); // 绘制缩小后的图
ImageIO.write(tag, “png”, thumbDs.getOutputStream());
BitmapImageResource thumbImg = epub.createBitmapImageResource(epub
.getContentFolder()
+ “/images/thumb.png”, “image/png”, thumbDs);
}

/**
* 根据章节id添加章节
*
* @param epub
* @param chapterUrl
*            章节url
* @throws Exception
*/
public static void addChapter(Publication epub, String HtmlBookId,
String chapterId) throws Exception {
String chapterUrl = “http://book.com/book/chapter_
+ HtmlBookId + “_” + chapterId + “.html”;
String chapterHtml = downloadUrlContent(chapterUrl);
String chapterTitle = HtmlBook2epub.getChapterTitle(chapterHtml);
String chapterText = HtmlBook2epub.getChapterText(chapterHtml);
chapterText = chapterText.replaceAll(“</p><p>”, “\n”);
chapterText = chapterText.replaceAll(“<p>”, “”);
chapterText = html2txt(chapterText.replaceAll(“</p>”, “”)).trim();
String chapterTextArr[] = chapterText.split(“\n”);
addChapter(epub, HtmlBookId, chapterId, chapterTitle, chapterTextArr);

}

/**根据章节内容添加章节
* @param epub
* @param chapterId
* @param title
* @param texts
*/
public static void addChapter(Publication epub, String HtmlBookId,
String chapterId, String title, String[] texts) {
if (texts == null || texts.length < 1) {
System.out.println(“warn: ” + HtmlBookId + “|” + chapterId + “|”
+ title + ” texts is empty”);
return;
}
if (title == null || title.length() < 1) {
System.out.println(“warn: ” + HtmlBookId + “|” + chapterId + “|”
+ title + ” title is empty”);
return;
}
NCXResource toc = epub.getTOC();
TOCEntry rootTOCEntry = toc.getRootTOCEntry();
String chapterFile = epub.getContentFolder() + “/” + chapterId
+ “.html”;
System.out.println(“addChapter ” + chapterFile + “|” + chapterId + “|”
+ title);
OPSResource chapter1 = epub.createOPSResource(chapterFile);
epub.addToSpine(chapter1);
OPSDocument chapter1Doc = chapter1.getDocument();
TOCEntry chapter1TOCEntry = toc.createTOCEntry(title, chapter1Doc
.getRootXRef());
rootTOCEntry.add(chapter1TOCEntry);
Element body1 = chapter1Doc.getBody();
Element header1 = chapter1Doc.createElement(“h1″);
header1.add(title);
body1.add(header1);
{// 添加原文来源:
String chapterUrl = “http://book.com/book/chapter_
+ HtmlBookId + “_” + chapterId + “.html”;
Element paragraph1 = chapter1Doc.createElement(“p”);
paragraph1.add(“原文来源:” + chapterUrl);
body1.add(paragraph1);
}
for (int i = 0; texts != null && i < texts.length; i++) {
Element paragraph1 = chapter1Doc.createElement(“p”);
paragraph1.add(texts[i]);
body1.add(paragraph1);
}

}

/**
* 添加小说简介
*
* @param epub
* @param HtmlBookId
* @throws Exception
*/
public static void addIntro(Publication epub, String HtmlBookId)
throws Exception {
String bookUrl = “http://book.com/book/index_” + HtmlBookId
+ “.html”;
String bookHtml = downloadUrlContent(bookUrl);
String startText = “<div >”;
String endText = “</div>”;
String intro = getStringLastBetween(bookHtml, startText, endText);
intro = intro.replaceAll(“<p>”, “”);
intro = html2txt(intro.replaceAll(“</p>”, “”));
intro = intro + “\n来源:” + bookUrl;
epub.addDCMetadata(“description”, intro);
Resource introRes = epub.createResource(“intro.txt”, “text/plain”,
new StringDataSource(intro));
startText = “<div class=\”linkOther\”>”;
endText = “</div>”;
String keywords = getStringLastBetween(bookHtml, startText, endText);
keywords = html2txt(keywords);
String ks[] = keywords.split(“\n”);
for (String s : ks) {
if (s != null && s.trim().length() > 0) {
epub.addDCMetadata(“subject”, s);//支持多个关键字
}
}

}

/**
* 用GB2312下载网页内容
*
* @param urlStr
* @return
* @throws Exception
*/
public static String downloadUrlContent(String urlStr) throws Exception {
return downloadUrlContent(urlStr, “GB2312″);
}

/**
* 根据章节内内容获得章节标题
*
* @param chapterHtml
* @return
*/
public static String getChapterTitle(String chapterHtml) {
String startText = “<h1>”;
String endText = “</h1>”;
return getStringLastBetween(chapterHtml, startText, endText);
}

/**
* 根据章节内容获取小说内容的html
*
* @param chapterHtml
* @return
*/
public static String getChapterText(String chapterHtml) {
String startText = “<div>”;
String endText = “</div>”;
return getStringLastBetween(chapterHtml, startText, endText);
}

/**
* 根据目录列表网页内容获取小说标题
*
* @param bookHtml
* @return
*/
public static String getBookTitle(String bookHtml) {
String startText = “<title>”;
String endText = “</title>”;
String title = getStringLastBetween(bookHtml, startText, endText);
if (title.contains(“_”)) {
title = title.substring(0, title.indexOf(“_”));
}
return html2txt(title);
}

/**
* 根据目录列表网页内容获取作者名称
*
* @param bookHtml
* @return
*/
public static String getBookAuthor(String bookHtml) {
String startText = “<h1>”;
String endText = “</h1>”;
String title = getStringLastBetween(bookHtml, startText, endText);
startText = “<span>”;
endText = “</span>”;
title = getStringLastBetween(title, startText, endText);
System.out.println(“getBookAuthor==” + title);
return title.length() > 0 ? html2txt(title) : “无名”;
}

/**
* 根据目录页面网页内容获得章节Id
*
* @param bookHtml
* @return
*/
public static String[] getChapterIds(String bookHtml, String HtmlBookId) {
java.util.List<String> chapterList = new ArrayList<String>();
String startText = “<a href=\”c_” + HtmlBookId + “_”;
String endText = “.html\”";
String chapterId = null;
while ((chapterId = getStringBetween(bookHtml, startText, endText))
.length() > 0) {
System.out.println(“chapterId==” + chapterId);
chapterList.add(chapterId);
bookHtml = bookHtml.substring(bookHtml.indexOf(startText)
+ startText.length());
}

return chapterList.toArray(new String[0]);
}

/**
* 根据小说首页html,提取封面图片路径
*
* @param bookHtml
* @return
*/
public static String getCoverImgUrl(String bookHtml) {
String startText = “http://book.com/cover”;
String endText = “.jpg”;
String url = getStringBetween(bookHtml, startText, endText);
url = startText + url + endText;
System.out.println(“getCoverImgUrl==” + url);
return url;
}

/**
* 获取文本中最后一次出现在两个字符串之间的文字,不包含开头和结尾的字符串
*
* @param src
* @param startText
* @param endText
* @return
*/
public static String getStringLastBetween(String src, String startText,
String endText) {
if (src != null && src.contains(startText)) {
int startIndex = src.lastIndexOf(startText);
int endIndex = src.indexOf(endText, startIndex);
if (endIndex > startIndex) {
return src.substring(startIndex + startText.length(), endIndex);

}
}
return “”;

}

/**
* 获取文本中第一次出现在两个字符串之间的文字,不包含开头和结尾的字符串
*
* @param src
* @param startText
* @param endText
* @return
*/
public static String getStringBetween(String src, String startText,
String endText) {
if (src != null && src.contains(startText)) {
int startIndex = src.indexOf(startText);
int endIndex = src.indexOf(endText, startIndex);
if (endIndex > startIndex) {
return src.substring(startIndex + startText.length(), endIndex);

}
}
return “”;

}

/**
* 用指定编码下载网页内容
*
* @param urlStr
* @param encoding
* @return
* @throws Exception
*/
public static String downloadUrlContent(String urlStr, String encoding)
throws Exception {
URL url = new URL(urlStr);
URLConnection urlc = url.openConnection();
urlc
.setRequestProperty(
“User-Agent”,
“Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.2) Gecko/20100115 Firefox/3.6″);
urlc
.setRequestProperty(“Accept”,
“text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8″);
urlc.setRequestProperty(“Accept-Language”, “zh-cn,zh;q=0.5″);
urlc.setRequestProperty(“Accept-Charset”, “GB2312,utf-8;q=0.7,*;q=0.7″);
urlc.setConnectTimeout(5000);
urlc.connect();
StringBuilder sb = new StringBuilder(4096);
BufferedReader in = new BufferedReader(new InputStreamReader(urlc
.getInputStream(), encoding));
String line;
while ((line = in.readLine()) != null) {
sb.append(line).append(‘\n’);
}
in.close();
System.out.println(urlStr);
return sb.toString().trim();
}
/**
*提取html的文本内容
*/
public static String html2txt(String s) {
if (s != null) {
return s.replaceAll(“<.*?>”, “”);
}
return “”;
}
}

Tags: ebook, epub, Java

Related posts

使用Servlet Filter来防止Xss漏洞和SQL注入的方法

2010年02月23日,星期二

使用Servlet Filter来防止Xss漏洞和SQL注入的方法

在用java进行web业务开发的时候,对于页面上接收到的参数,除了极少数是不可预知的内容外,大量的参数名和参数值都是不会出现触发Xss漏洞的字符。而通常为了避免Xss漏洞,都是开发人员各自在页面输出和数据入库等地方加上各种各样的encode方法来避免Xss问题。而由于开发人员的水平不一,加上在编写代码的过程中安全意识的差异,可能会粗心漏掉对用户输入内容进行encode处理。针对这种大量参数不可能引起Xss和SQL注入漏洞的业务场景,可以使用一个适用大多数业务场景的通用处理方法,牺牲少量用户体验,来避免Xss漏洞和SQL注入。
那就是利用Servlet的过滤器机制,编写定制的XssFilter,将request请求代理,覆盖getParameter和getHeader方法将参数名和参数值里的指定半角字符,强制替换成全角字符。
使得在业务层的处理时不用担心会有异常输入内容。

相关的代码如下:
XssFilter.java
[code]
package com.lizongbo.filter;

import java.io.IOException;

import javax.servlet.Filter;
import javax.servlet.FilterChain;
import javax.servlet.FilterConfig;
import javax.servlet.ServletException;
import javax.servlet.ServletRequest;
import javax.servlet.ServletResponse;
import javax.servlet.http.HttpServletRequest;

/**
 * Servlet filter that wraps every HTTP request in an
 * {@link XssHttpServletRequestWrapper} before handing it down the chain.
 */
public class XssFilter implements Filter {

    /** No configuration is needed. */
    @Override
    public void init(FilterConfig config) throws ServletException {
    }

    /**
     * Passes the request on, wrapped when it is an HTTP request.
     *
     * @param request incoming request
     * @param response outgoing response, passed through untouched
     * @param chain remainder of the filter chain
     * @throws IOException if a later filter or the servlet fails with an I/O error
     * @throws ServletException if a later filter or the servlet fails
     */
    @Override
    public void doFilter(ServletRequest request, ServletResponse response,
            FilterChain chain) throws IOException, ServletException {
        ServletRequest filtered = request;
        // The wrapper only exists for HTTP requests; anything else is passed
        // through unchanged instead of failing on the cast.
        if (request instanceof HttpServletRequest) {
            filtered = new XssHttpServletRequestWrapper((HttpServletRequest) request);
        }
        chain.doFilter(filtered, response);
    }

    /** Nothing to release. */
    @Override
    public void destroy() {
    }
}

[/code]

XssHttpServletRequestWrapper.java
[code]
package com.lizongbo.filter;

import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletRequestWrapper;

/**
*
* @author lizongbo
*
*/
public class XssHttpServletRequestWrapper extends HttpServletRequestWrapper {
HttpServletRequest orgRequest = null;

public XssHttpServletRequestWrapper(HttpServletRequest request) {
super(request);
orgRequest = request;
}

/**
* 覆盖getParameter方法,将参数名和参数值都做xss过滤。<br/>
* 如果需要获得原始的值,则通过super.getParameterValues(name)来获取<br/>
* getParameterNames,getParameterValues和getParameterMap也可能需要覆盖
*/
@Override
public String getParameter(String name) {
String value = super.getParameter(xssEncode(name));
if (value != null) {
value = xssEncode(value);
}
return value;
}

/**
* 覆盖getHeader方法,将参数名和参数值都做xss过滤。<br/>
* 如果需要获得原始的值,则通过super.getHeaders(name)来获取<br/>
* getHeaderNames 也可能需要覆盖
*/
@Override
public String getHeader(String name) {

String value = super.getHeader(xssEncode(name));
if (value != null) {
value = xssEncode(value);
}
return value;
}

/**
* 将容易引起xss漏洞的半角字符直接替换成全角字符
*
* @param s
* @return
*/
private static String xssEncode(String s) {
if (s == null || s.isEmpty()) {
return s;
}
StringBuilder sb = new StringBuilder(s.length() + 16);
for (int i = 0; i < s.length(); i++) {
char c = s.charAt(i);
switch (c) {
case '>':
sb.append('>');//全角大于号
break;
case '<':
sb.append('<');//全角小于号
break;
case '\'':
sb.append('‘');//全角单引号
break;
case '\"':
sb.append('“');//全角双引号
break;
case '&':
sb.append('&');//全角
break;
case '\\':
sb.append('\');//全角斜线
break;
case '#':
sb.append('#');//全角井号
break;
default:
sb.append(c);
break;
}
}
return sb.toString();
}

/**
* 获取最原始的request
*
* @return
*/
public HttpServletRequest getOrgRequest() {
return orgRequest;
}
/**
* 获取最原始的request的静态方法
*
* @return
*/
public static HttpServletRequest getOrgRequest(HttpServletRequest req) {
if(req instanceof XssHttpServletRequestWrapper){
return ((XssHttpServletRequestWrapper)req).getOrgRequest();
}

return req;
}
}

}
[/code]

Tags: filter, Java, servlet, SQL, Xss

Related posts

Windows下编写google app engine jsp出现中文乱码问题的真正原因

2009年04月12日,星期天

Windows下编写google app engine jsp出现中文乱码问题的真正原因,是com.google.appengine.tools.admin.Application里启动javac没加-encoding参数导致.

在Linux下,因为一般都是LANG=zh_CN.UTF-8,因此就不会遇到这个问题了。

Windows下一般是取的默认的file.encoding为GBK,因此根据jsp生成的class文件就是乱码内容了。

验证流程如下:
1.首先重命名E:\Java\appengine-java-sdk-1.2.0\lib\shared\jsp\jasper-compiler-5.0.28.jar为 jasper-compiler-5.0.28.jar.bak。
然后在E:\Java\appengine-java-sdk-1.2.0\bin下运行 appcfg update E:\Java\workspace\testweb\war。
这个时候在命令行下会看到如下的出错信息。
E:\Java\appengine-java-sdk-1.2.0\bin>java -cp "E:\Java\appengine-java-sdk-1.2.0\bin\\..\lib\appengine-tools-api.jar" com.google.appengine.tools.admin.AppCfg update e:\Java\workspace\testweb\war
Reading application configuration data…
2009-04-12 03:05:34.240::INFO:  Logging to STDERR via org.mortbay.log.StdErrLog
Beginning server interaction for lizongbo…
0% Creating staging directory
5% Scanning for jsp files.
8% Compiling jsp files.
Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/jasper/JspC
at com.google.appengine.tools.development.LocalJspC.main(LocalJspC.java:
14)
Caused by: java.lang.ClassNotFoundException: org.apache.jasper.JspC
at java.net.URLClassLoader$1.run(URLClassLoader.java:200)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:188)
at java.lang.ClassLoader.loadClass(ClassLoader.java:307)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:301)
at java.lang.ClassLoader.loadClass(ClassLoader.java:252)
at java.lang.ClassLoader.loadClassInternal(ClassLoader.java:320)
… 1 more
java.lang.RuntimeException: Failed to compile jsp files via E:\Java\jdk1.6.0_13\
jre\bin\java.exe -classpath com.google.appengine.tools.development.LocalJspC -uriroot C:\DOCUME
~1\ADMINI~1\LOCALS~1\Temp\appcfg8338201102819535049.tmp -p org.apache.jsp -l -v
-webinc C:\DOCUME~1\ADMINI~1\LOCALS~1\Temp\appcfg8338201102819535049.tmp\WEB-INF
\generated_web.xml -d C:\DOCUME~1\ADMINI~1\LOCALS~1\Temp\appcfg83382011028195350
49.tmp\WEB-INF\classes

2.将jasper-compiler-5.0.28.jar.bak重命名回 jasper-compiler-5.0.28.jar,
将E:\Java\jdk1.6.0_13\bin\javac.exe重命名为 javacc.exe,
在Eclipse里通过Google app Engine的插件发布,得到下面的出错信息:

Unable to upload:
java.lang.IllegalStateException: cannot find javac executable based on java.home, tried “E:\Java\jdk1.6.0_13\jre\bin\javac.exe” and “E:\Java\jdk1.6.0_13\bin\javac.exe”
at com.google.appengine.tools.admin.AppAdminFactory$ApplicationProcessingOptions.getJavaCompiler(AppAdminFactory.java:325)
at com.google.appengine.tools.admin.Application.compileJavaFiles(Application.java:340)
at com.google.appengine.tools.admin.Application.compileJsps(Application.java:326)
at com.google.appengine.tools.admin.Application.createStagingDirectory(Application.java:235)
at com.google.appengine.tools.admin.AppAdminImpl.update(AppAdminImpl.java:39)
at com.google.appengine.eclipse.core.proxy.AppEngineBridgeImpl.deploy(AppEngineBridgeImpl.java:203)
at com.google.appengine.eclipse.core.deploy.DeployProjectJob.runInWorkspace(DeployProjectJob.java:97)
at org.eclipse.core.internal.resources.InternalWorkspaceJob.run(InternalWorkspaceJob.java:38)
at org.eclipse.core.internal.jobs.Worker.run(Worker.java:55)
java.lang.IllegalStateException: cannot find javac executable based on java.home, tried “E:\Java\jdk1.6.0_13\jre\bin\javac.exe” and “E:\Java\jdk1.6.0_13\bin\javac.exe”
于是根据堆栈找到了com.google.appengine.tools.admin.Application,使用Jd-gui(来自http://java.decompiler.free.fr/)打开class找到了如下代码:
[code]
// Excerpt (decompiled) that assembles the javac command line as a list of
// strings and launches it as an external process.
// The first entry is the path of the javac executable chosen by the options.
String javacCmd = opts.getJavaCompiler().getPath();
args.add(javacCmd);
args.add("-classpath");
args.add(classpath.toString());
args.add("-d");
args.add(classDir.getPath());

// Append every file yielded by the iterator over classDir whose path ends in
// ".java" (compared case-insensitively).
for (File f : new FileIterator(classDir)) {
if (f.getPath().toLowerCase().endsWith(".java"))
args.add(f.getPath());

}

// Five entries means only the fixed arguments above are present, i.e. no
// source file was appended, so there is nothing to compile.
// NOTE(review): assumes args was empty before this excerpt - confirm.
if (args.size() == 5)
return;

// No "-encoding" option is added anywhere in this excerpt, so javac is started
// without an explicit source encoding.
Process javac = startProcess((String[])args.toArray(new String[0]));
[/code]

从代码看到,Google App engine只用 apache Tomcat jasper生成java文件,再通过拼凑命令行字符串方式生成编译java文件为Class的命令,通过Process执行。
由于命令行参数没有设置encoding参数,于是javac使用了系统默认的file.encoding,在中文Windows下,基本都是GBK。在Linux下,因为一般都是LANG=zh_CN.UTF-8,因此就不会遇到这个问题了。
下面是linux 下ps看到的完整javac命令:

lizongbo  6733  0.0  0.4 690156  9228 ?        Sl   21:16   0:00 /usr/local/jdk1.6.0_12/bin/javac -classpath /usr/local/java/appengine-java-sdk-1.2.0/lib/impl/appengine-api.jar:/usr/local/java/appengine-java-sdk-1.2.0/lib/impl/appengine-api-stubs.jar:/usr/local/java/appengine-java-sdk-1.2.0/lib/impl/appengine-local-runtime.jar:/usr/local/java/appengine-java-sdk-1.2.0/lib/shared/geronimo-el_1.0_spec-1.0.1.jar:/usr/local/java/appengine-java-sdk-1.2.0/lib/shared/geronimo-servlet_2.5_spec-1.2.jar:/usr/local/java/appengine-java-sdk-1.2.0/lib/shared/appengine-local-runtime-shared.jar:/usr/local/java/appengine-java-sdk-1.2.0/lib/shared/geronimo-jsp_2.1_spec-1.0.1.jar:/usr/local/java/appengine-java-sdk-1.2.0/lib/shared/jsp/jasper-runtime-5.0.28.jar:/usr/local/java/appengine-java-sdk-1.2.0/lib/shared/jsp/commons-el-1.0.jar:/usr/local/java/appengine-java-sdk-1.2.0/lib/shared/jsp/ant-launcher-1.6.5.jar:/usr/local/java/appengine-java-sdk-1.2.0/lib/shared/jsp/commons-logging-1.1.1.jar:/usr/local/java/appengine-java-sdk-1.2.0/lib/shared/jsp/ant-1.6.5.jar:/usr/local/java/appengine-java-sdk-1.2.0/lib/shared/jsp/jasper-compiler-5.0.28.jar:/tmp/appcfg8102867646621343748.tmp/WEB-INF/classes:/tmp/appcfg8102867646621343748.tmp/WEB-INF/lib/ant-launcher-1.6.5.jar:/tmp/appcfg8102867646621343748.tmp/WEB-INF/lib/jakarta-standard-1.1.2.jar:/tmp/appcfg8102867646621343748.tmp/WEB-INF/lib/jasper-compiler-5.0.28.jar:/tmp/appcfg8102867646621343748.tmp/WEB-INF/lib/datanucleus-appengine-1.0.0.final.jar:/tmp/appcfg8102867646621343748.tmp/WEB-INF/lib/ant-1.6.5.jar:/tmp/appcfg8102867646621343748.tmp/WEB-INF/lib/datanucleus-core-1.1.0.jar:/tmp/appcfg8102867646621343748.tmp/WEB-INF/lib/geronimo-jta_1.1_spec-1.1.1.jar:/tmp/appcfg8102867646621343748.tmp/WEB-INF/lib/jdo2-api-2.3-SNAPSHOT.jar:/tmp/appcfg8102867646621343748.tmp/WEB-INF/lib/jasper-runtime-5.0.28.jar:/tmp/appcfg8102867646621343748.tmp/WEB-INF/lib/commons-logging-1.1.1.jar:/tmp/appcfg8102867646621343748.tmp/WEB-INF/lib/appengine-api-1.0-sdk-1.2.
0.jar:/tmp/appcfg8102867646621343748.tmp/WEB-INF/lib/geronimo-jpa_3.0_spec-1.1.1.jar:/tmp/appcfg8102867646621343748.tmp/WEB-INF/lib/datanucleus-jpa-1.1.0.jar:/tmp/appcfg8102867646621343748.tmp/WEB-INF/lib/commons-el-1.0.jar:/tmp/appcfg8102867646621343748.tmp/WEB-INF/lib/jakarta-jstl-1.1.2.jar: -d /tmp/appcfg8102867646621343748.tmp/WEB-INF/classes /tmp/appcfg8102867646621343748.tmp/WEB-INF/classes/org/apache/jsp/test_jsp.java /tmp/appcfg8102867646621343748.tmp/WEB-INF/classes/org/apache/jsp/p_jsp.java

在Google app Engine的java SDK没有完善之前,只有在Ubuntu下写jsp最方便不会出现中文乱码问题。
受网上相关文章的误导,花了不少时间去看 Apache Tomcat jasper的代码,并尝试将 jasper-compiler-5.0.28.jar 替换为Apache Tomcat 6.0.18的最新版本,结果发现乱码问题的原因根本不在那里。

appengine-web.xml里可以配置:<property name="file.encoding" value="UTF-8" />,但是这也不能解决乱码问题。

Tags: encoding, Google App Engine, Java, jsp, jspc

Related posts