package pt.ist.renderers.extensions.htmlEditor;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.tidy.Tidy;
import pt.ist.renderers.components.converters.ConversionException;
import com.tecnick.htmlutils.htmlentities.HTMLEntities;
/**
 * Converts an HTML fragment to plain text while preserving some of the
 * formatting, such as paragraphs, lists, quotations and smileys.
 * <p>
 * The document is walked node by node and the resulting text is accumulated
 * in an internal buffer. The buffer and the current column are instance
 * state, so a single instance must not be used by several threads at once.
 *
 * @author cfgi
 */
public class HtmlToTextConverter extends TidyConverter {

    /** Extra indentation applied to nested structures (quotations, list items). */
    private static final String DEFAULT_INDENT = " ";

    /** Text produced so far for the conversion in progress. */
    private StringBuilder buffer;

    /** Column reached on the current output line; drives word wrapping. */
    private int pos;

    private boolean wrap;
    private int lineLength;

    /**
     * Creates a converter that wraps text at 80 columns.
     */
    public HtmlToTextConverter() {
        super();

        this.pos = 0;
        this.buffer = new StringBuilder();
        this.wrap = true;
        this.lineLength = 80;
    }

    /**
     * @return the line length used when wrapping text
     */
    public int getLineLength() {
        return this.lineLength;
    }

    /**
     * Sets the line length used when wrapping text. This value is ignored if
     * {@link #isWrap()} returns false.
     *
     * @param lineLength the maximum number of columns per line
     */
    public void setLineLength(int lineLength) {
        this.lineLength = lineLength;
    }

    /**
     * Tells whether this converter wraps text according to the line length
     * specified with {@link #setLineLength(int)}.
     *
     * @return <code>true</code> if line wrapping is enabled
     */
    public boolean isWrap() {
        return this.wrap;
    }

    /**
     * Chooses whether this converter should do line wrapping or not.
     *
     * @param wrap <code>true</code> to wrap lines at {@link #getLineLength()}
     */
    public void setWrap(boolean wrap) {
        this.wrap = wrap;
    }

    /**
     * Walks the given document, renders it as plain text and writes the
     * result to the given stream.
     *
     * @param outStream the stream that receives the text; it is flushed but
     *            not closed here
     * @param tidy the Tidy instance used to print individual nodes
     * @param document the parsed document to convert
     * @throws ConversionException if the text cannot be written to the stream
     */
    @Override
    protected void parseDocument(OutputStream outStream, Tidy tidy, Document document) {
        tidy.setPrintBodyOnly(false);

        // Start from a clean state so that reusing this instance does not
        // re-emit the text of a previous conversion.
        this.buffer.setLength(0);
        this.pos = 0;

        parseNode(tidy, document, "");

        try {
            // NOTE(review): the platform default charset is used here;
            // confirm which encoding the caller expects.
            Writer writer = new OutputStreamWriter(outStream);
            writer.write(this.buffer.toString());
            writer.flush();
        } catch (IOException e) {
            e.printStackTrace();
            throw new ConversionException("renderers.converter.text.write");
        }
    }

    /**
     * Renders a single node. Documents and elements are processed
     * recursively, text nodes are appended as wrapped text and every other
     * node type is ignored.
     */
    private void parseNode(Tidy tidy, Node node, String indent) {
        switch (node.getNodeType()) {
        case Node.DOCUMENT_NODE:
            parseNodeChildren(tidy, node, indent);
            break;
        case Node.ELEMENT_NODE:
            parseElement(tidy, (Element) node, indent);
            break;
        case Node.TEXT_NODE:
            addText(getTextContent(tidy, node), indent);
            break;
        default:
            break;
        }
    }

    /**
     * Renders an element according to its tag name. Elements without a
     * specific treatment are transparent: only their children are rendered.
     */
    private void parseElement(Tidy tidy, Element element, String indent) {
        String name = element.getNodeName().toLowerCase();

        if (name.equals("p")) {
            ensureBlankLine();
            addCodeText(indent);
            parseNodeChildren(tidy, element, indent);
            ensureBlankLine();
            addCodeText(indent);
        } else if (name.equals("blockquote")) {
            ensureBlankLine();
            addCodeText(indent + DEFAULT_INDENT);
            parseNodeChildren(tidy, element, indent + DEFAULT_INDENT);
            ensureBlankLine();
            addCodeText(indent);
        } else if (name.equals("ul") || name.equals("ol")) {
            ensureLineBreak();
            parseList(tidy, element, name.equals("ol"), indent);
            ensureLineBreak();
            addCodeText(indent);
        } else if (name.equals("br")) {
            addLineBreak();
            addCodeText(indent);
        } else if (name.equals("hr")) {
            ensureLineBreak();
            addText("----------", indent);
            ensureLineBreak();
            addCodeText(indent);
        } else if (name.equals("pre")) {
            ensureBlankLine();
            addCodeText(indent);
            addCodeText(getChildTextContent(tidy, element));
            ensureBlankLine();
            addCodeText(indent);
        } else if (name.equals("code")) {
            addCodeText(getChildTextContent(tidy, element));
        } else if (name.equals("a")) {
            parseNodeChildren(tidy, element, indent);

            // Anchors without a target (e.g. named anchors) would otherwise
            // leave a dangling "()" in the text.
            String href = element.getAttribute("href");
            if (href != null && href.length() > 0) {
                addText("(" + href + ")", indent);
            }
        } else if (name.equals("img")) {
            parseSmile(tidy, element, indent);
        } else {
            parseNodeChildren(tidy, element, indent);
        }
    }

    /**
     * Renders the <code>li</code> children of a list, one per line. Items of
     * ordered lists are prefixed with their position, others with "* ".
     * Children that are not <code>li</code> elements are skipped.
     */
    private void parseList(Tidy tidy, Element element, boolean ordered, String indent) {
        NodeList itemList = element.getChildNodes();

        // Counts list items only, so skipped children (e.g. whitespace text
        // nodes) do not create gaps in the numbering.
        int number = 0;

        for (int i = 0; i < itemList.getLength(); i++) {
            Node item = itemList.item(i);

            if (item.getNodeType() != Node.ELEMENT_NODE || !item.getNodeName().equalsIgnoreCase("li")) {
                continue;
            }

            number++;

            addCodeText(indent + DEFAULT_INDENT);
            addText(ordered ? String.valueOf(number) + ". " : "* ", indent);
            parseNodeChildren(tidy, item, indent + DEFAULT_INDENT);
            addLineBreak();
        }
    }

    /**
     * Replaces a smiley image by its textual emoticon. Only images whose
     * source looks like <code>...smiley-NAME.gif</code> are considered; the
     * NAME part is looked up in {@link EmoticonMap}. Any other image is
     * dropped from the output.
     */
    private void parseSmile(Tidy tidy, Element element, String indent) {
        String source = element.getAttribute("src");

        if (source == null) {
            return;
        }

        if (!source.matches(".*?smiley-[^.]+\\.gif")) { // TODO: check this
                                                         // convention
            return;
        }

        int indexStart = source.lastIndexOf("smiley-") + "smiley-".length();
        int indexEnd = source.lastIndexOf(".");

        String smiley = source.substring(indexStart, indexEnd);
        String emoticon = EmoticonMap.getEmoticon(smiley);

        if (emoticon != null) {
            addText(emoticon, indent);
        }
    }

    /**
     * Lets Tidy print the given node and returns what it printed.
     */
    private String getTextContent(Tidy tidy, Node node) {
        ByteArrayOutputStream outStream = new ByteArrayOutputStream();
        tidy.pprint(node, outStream);

        // No flush needed: ByteArrayOutputStream keeps everything in memory.
        // NOTE(review): bytes are decoded with the platform default charset;
        // confirm this matches Tidy's output encoding.
        return new String(outStream.toByteArray());
    }

    /**
     * Concatenates the printed form of every child of the given node.
     */
    private String getChildTextContent(Tidy tidy, Node node) {
        StringBuilder builder = new StringBuilder();

        NodeList children = node.getChildNodes();
        for (int i = 0; i < children.getLength(); i++) {
            builder.append(getTextContent(tidy, children.item(i)));
        }

        return builder.toString();
    }

    /**
     * Renders every child of the given node, in document order.
     */
    private void parseNodeChildren(Tidy tidy, Node node, String indent) {
        NodeList children = node.getChildNodes();
        for (int i = 0; i < children.getLength(); i++) {
            parseNode(tidy, children.item(i), indent);
        }
    }

    /**
     * Appends flowing text. Entities are unescaped, whitespace runs are
     * collapsed and every word is followed by a single space. When wrapping
     * is enabled, a word that does not fit in the current line starts a new
     * line prefixed with <code>indent</code>.
     *
     * @param htmlText the text to add; <code>null</code> is ignored
     * @param indent the prefix of continuation lines
     */
    private void addText(String htmlText, String indent) {
        if (htmlText == null) {
            return;
        }

        String text = unescapeHtml(htmlText);
        String[] words = text.split("\\p{Space}+");

        for (int i = 0; i < words.length; i++) {
            String word = words[i];

            if (word.length() == 0) {
                continue;
            }

            // Honour the wrap setting: without it lines grow unbounded.
            if (isWrap() && pos + word.length() + 1 > getLineLength()) {
                this.buffer.append("\n" + indent);
                this.buffer.append(word + " ");

                pos = indent.length() + word.length() + 1;
            } else {
                this.buffer.append(word + " ");

                pos += word.length() + 1;
            }
        }
    }

    /**
     * Replaces the HTML entities, ampersands, angle brackets and quotes of
     * the given text with the characters they stand for.
     */
    private String unescapeHtml(String htmlText) {
        String text = htmlText;

        text = HTMLEntities.unhtmlentities(text);
        text = HTMLEntities.unhtmlAmpersand(text);
        text = HTMLEntities.unhtmlAngleBrackets(text);
        text = HTMLEntities.unhtmlQuotes(text);

        return text;
    }

    /**
     * Appends text verbatim, that is, unescaped but without collapsing
     * whitespace or wrapping. Also used to emit indentation.
     *
     * @param htmlText the text to add; <code>null</code> is ignored
     */
    private void addCodeText(String htmlText) {
        if (htmlText == null) {
            return;
        }

        String text = unescapeHtml(htmlText);
        this.buffer.append(text);

        // Keep the column in sync with what was really written: verbatim
        // text may contain line breaks, which restart the column.
        int lastBreak = text.lastIndexOf('\n');
        if (lastBreak >= 0) {
            pos = text.length() - lastBreak - 1;
        } else {
            pos += text.length();
        }
    }

    /**
     * Unconditionally starts a new line.
     */
    private void addLineBreak() {
        buffer.append("\n");
        pos = 0;
    }

    /**
     * Starts a new line unless the buffer is empty or already ends with a
     * line break.
     */
    private void ensureLineBreak() {
        int length = buffer.length();

        if (length == 0 || buffer.charAt(length - 1) == '\n') {
            return;
        }

        addLineBreak();
    }

    /**
     * Makes sure the text added next is preceded by an empty line. Nothing
     * is added to an empty buffer.
     */
    private void ensureBlankLine() {
        if (buffer.length() == 0) {
            return;
        }

        ensureLineBreak();

        // The buffer now ends with a line break. A buffer holding only that
        // line break already starts with an empty line.
        int length = buffer.length();
        if (length < 2 || buffer.charAt(length - 2) == '\n') {
            return;
        }

        addLineBreak();
    }
}