public class XmlHtmlHandler
extends java.lang.Object
Constructor and Description |
---|
XmlHtmlHandler() |
Modifier and Type | Method and Description |
---|---|
static java.lang.String |
decodeTextOther(java.lang.String xmlStr)
Decode the string to re-add some of the coded symbols
& , when for some reason
the parsing process may have missed this in the text. |
static java.lang.String |
decodeXml(java.lang.String xmlStr)
Decode the whole string-based XML document, to re-add the symbols.
|
static java.lang.String |
decodeXmlContent(java.lang.String xmlStr)
Decode the string-based XML content to re-add some of the coded symbols.
|
static java.lang.String |
decodeXmlTag(java.lang.String xmlStr)
Decode the string-based XML tag symbols only, to re-add the XML element coding.
|
static java.lang.String |
encodeTextOther(java.lang.String xmlStr)
Code the string text to replace some of the symbols
& with codes, when for some reason
the parsing process may have missed this in the text. |
static java.lang.String |
encodeXml(java.lang.String xmlStr)
Code the whole string-based XML document.
|
static java.lang.String |
encodeXmlContent(java.lang.String xmlStr)
Code the xml-based string content to replace some of the text with symbols.
|
static java.lang.String |
encodeXmlTag(java.lang.String xmlStr)
Code the xml-based element tags only, to replace the element brackets only.
|
static java.util.ArrayList<java.lang.String> |
getAnchors(java.lang.String theText)
Get a list of all anchors in the text.
|
static java.lang.String |
getBodyPart(java.lang.String theText)
Parse the text and return only the body part.
|
static java.util.ArrayList<java.lang.String> |
getHtmlFormatList()
Get the list of format tags to remove from any html text.
|
static java.util.ArrayList<java.lang.String> |
getHtmlFormatWithDescriptionList()
Get the list of format tags to remove from any html text.
|
static java.util.ArrayList<java.lang.String> |
getHtmlWhitespaceList()
Get the list of whitespace format tags to remove from any html text.
|
static java.lang.String |
getNameAttr(org.licas_xml.abs.Element xmlElem)
Get the value of a name element in the xml.
|
static java.lang.String |
getNextWholeTag(java.lang.String theText)
Get the next whole tag that is contained in the text string.
|
static java.lang.String |
getTagName(java.lang.String theTag)
Get the name part of the tag only, without a prefix or brackets.
|
static java.lang.String |
getTitle(java.lang.String theText)
Get the title of the document.
|
static java.lang.String |
innerText(org.licas_xml.abs.Element rootElem)
Parse the root element to return a concatenated text of all nested elements.
|
static boolean |
isEndHtmlContentTag(java.lang.String theTag)
Return true if the tag entered is the end of an html content tag.
|
static boolean |
isHyperLink(java.lang.String httpStr)
Return true if the text entered starts with an http address protocol.
|
static boolean |
isHyperLinkTag(java.lang.String theTag)
Return true if the tag entered is potentially a hyperlink tag.
|
static boolean |
isRestQuery(java.lang.String httpStr)
Return true if the text entered starts with an http address protocol.
|
static boolean |
isStartHtmlContentTag(java.lang.String theTag)
Return true if the tag entered is the start of an html content tag.
|
static java.lang.String |
removeCDATA(java.lang.String theText)
Remove any CDATA section coding and return the text content only.
|
static java.lang.String |
removeCommented(java.lang.String theText)
|
static java.lang.String |
removeFooter(java.lang.String theText)
Remove the footer section of the document, defined as anything after the
body section. |
static java.lang.String |
removeFormatting(java.lang.String theText)
Remove html and other coded formatting, but replace some of them with the actual characters.
|
static java.lang.String |
removeHead(java.lang.String theText)
Remove the head section of the document.
|
static java.lang.String |
removeHex(java.lang.String theText)
Remove some hex number characters and replace with blanks.
|
static java.lang.String |
removeNextWholeTag(java.lang.String theText)
Remove the next whole tag that is contained in the text string.
|
static java.util.ArrayList<java.lang.String> |
removeNoScripts(java.util.ArrayList<java.lang.String> textLines)
Remove any no-script parts from the text and return.
|
static java.util.ArrayList<java.lang.String> |
removeOrderedLists(java.util.ArrayList<java.lang.String> textLines)
Remove any ordered list parts from the text and return.
|
static java.util.ArrayList<java.lang.String> |
removeScripts(java.util.ArrayList<java.lang.String> textLines)
Remove any script parts from the text and return.
|
static java.util.ArrayList<java.lang.String> |
removeStyles(java.util.ArrayList<java.lang.String> textLines)
Remove any style parts from the text and return.
|
static java.lang.String |
removeTagBracketed(java.lang.String theText)
Remove any bracketed section coding inside a tag and return the rest.
|
static java.util.ArrayList<java.lang.String> |
removeUnorderedLists(java.util.ArrayList<java.lang.String> textLines)
Remove any unordered list parts from the text and return.
|
static java.lang.String |
removeVoidTags(java.lang.String theText)
Remove void elements from the text.
|
static java.lang.String |
removeXmlCoded(java.lang.String theText)
Remove the xml coded content that starts with
& and ends with ; . |
static java.lang.String |
removeXmlTags(java.lang.String theText)
Remove the xml element tags from the text and keep only the xml element content.
|
static java.util.ArrayList<java.lang.String> |
removeXmlTagsList(java.lang.String theText)
Remove the xml element tags from the text and keep only the xml element content.
|
static java.lang.String |
repairXmlTags(java.lang.String theText,
java.lang.String rootTag)
Deprecated.
You should not need to use this method.
|
public static boolean isStartHtmlContentTag(java.lang.String theTag)
theTag
- the tag to check.public static boolean isEndHtmlContentTag(java.lang.String theTag)
theTag
- the tag to check.public static boolean isHyperLinkTag(java.lang.String theTag)
theTag
- the tag to check.public static boolean isHyperLink(java.lang.String httpStr)
httpStr
- the http string to check.public static boolean isRestQuery(java.lang.String httpStr)
httpStr
- the http string to check.public static java.lang.String getNameAttr(org.licas_xml.abs.Element xmlElem)
xmlElem
- the element to check.public static java.lang.String getTitle(java.lang.String theText)
theText
- the text representation of the html document.public static java.lang.String removeHead(java.lang.String theText)
theText
- the text representation of the html document.</head>
is removed.public static java.lang.String removeFooter(java.lang.String theText)
body
section.theText
- the text representation of the html document.</body>
is removed.public static java.lang.String getBodyPart(java.lang.String theText)
theText
- the text to parse.public static java.lang.String repairXmlTags(java.lang.String theText, java.lang.String rootTag)
theText
- the text representation of the XML document.rootTag
- tag name only to start with, in case the XML has headers.
In general, set to null
for the whole text.public static java.lang.String removeVoidTags(java.lang.String theText)
'/'
, indicator.theText
- the input text.public static java.util.ArrayList<java.lang.String> removeStyles(java.util.ArrayList<java.lang.String> textLines)
textLines
- lines of text to parse.public static java.util.ArrayList<java.lang.String> removeScripts(java.util.ArrayList<java.lang.String> textLines)
textLines
- lines of text to parse.public static java.util.ArrayList<java.lang.String> removeNoScripts(java.util.ArrayList<java.lang.String> textLines)
textLines
- lines of text to parse.public static java.util.ArrayList<java.lang.String> removeOrderedLists(java.util.ArrayList<java.lang.String> textLines)
textLines
- lines of text to parse.public static java.util.ArrayList<java.lang.String> removeUnorderedLists(java.util.ArrayList<java.lang.String> textLines)
textLines
- lines of text to parse.public static java.util.ArrayList<java.lang.String> getAnchors(java.lang.String theText)
theText
- the text representation of the XML document.public static java.lang.String removeCommented(java.lang.String theText)
HtmlConst.COMMSTART and COMMEND.
Each tag is replaced by a space. This method does not require a valid XML document.
theText
- the text representation of the XML document.public static java.lang.String removeTagBracketed(java.lang.String theText)
theText
- the text representation of the XML document.'[', ']'
section has been removed.public static java.lang.String removeCDATA(java.lang.String theText)
theText
- the text representation of the XML document.public static java.lang.String removeXmlTags(java.lang.String theText)
theText
- the text representation of the XML document.public static java.util.ArrayList<java.lang.String> removeXmlTagsList(java.lang.String theText)
theText
- the text representation of the XML document.public static java.lang.String innerText(org.licas_xml.abs.Element rootElem) throws java.lang.Exception
rootElem
- the element to parse from.java.lang.Exception
- any error.public static java.lang.String getNextWholeTag(java.lang.String theText)
theText
- the text representation of the XML document.public static java.lang.String removeNextWholeTag(java.lang.String theText)
theText
- the text representation of the XML document.public static java.lang.String getTagName(java.lang.String theTag)
theTag
- the text term to process.public static java.lang.String removeFormatting(java.lang.String theText)
br
html break statements, so you might want
to replace them first.theText
- the text representation of the html document.public static java.lang.String removeHex(java.lang.String theText)
theText
- the text representation of the html document.public static java.lang.String removeXmlCoded(java.lang.String theText) throws java.lang.Exception
&
and ends with ;
.theText
- the text representation of the xml document.java.lang.Exception
- any error.public static java.util.ArrayList<java.lang.String> getHtmlFormatList()
public static java.util.ArrayList<java.lang.String> getHtmlWhitespaceList()
public static java.util.ArrayList<java.lang.String> getHtmlFormatWithDescriptionList()
public static java.lang.String encodeXml(java.lang.String xmlStr)
xmlStr
- the string-based version of the XML document.
&gt;
instead of >.
public static java.lang.String decodeXml(java.lang.String xmlStr)
xmlStr
- the string-based version of the XML document.
>
instead of &gt;.
public static java.lang.String encodeXmlTag(java.lang.String xmlStr)
xmlStr
- the string with tag brackets.
&gt;
instead of >.
public static java.lang.String decodeXmlTag(java.lang.String xmlStr)
xmlStr
- the string with brackets coded, for example, &gt;
instead of >.
public static java.lang.String encodeXmlContent(java.lang.String xmlStr)
xmlStr
- the string content, which is the element text.public static java.lang.String decodeXmlContent(java.lang.String xmlStr)
xmlStr
- the string content, which is the element text.public static java.lang.String encodeTextOther(java.lang.String xmlStr)
&
with codes, when for some reason
the parsing process may have missed this in the text.
xmlStr
- the string content.public static java.lang.String decodeTextOther(java.lang.String xmlStr)
&
, when for some reason
the parsing process may have missed this in the text.
xmlStr
- the string content.