Guess it would help to attach the file, huh? :)
Doug Hendricks wrote:
Until Rob's goodies are available, the attached string/stream friendly
HTML/XML-Stripper (Cache 5.x) class might prove useful. Use the
HTMLToText Method to achieve what you want. It even reformats the
remaining text to the desired line length.
Doug Hendricks
Rob Tweed wrote:
Watch out for something interesting coming out soon that may help in
this kind of scenario - our HTML to XHTML converter. This is a core
subsystem that now forms the heart of a whole bunch of technologies
and solutions I've been working on recently. Provided your starting
point is HTML, albeit that it may include JavaScript, CSS, PHP script,
COS within <script> tags, it can all be converted to XHTML, and once
in that format, it can be transformed in any way you like using DOM
API methods.
Rob
On Thu, 08 Jul 2004 23:14:59 -0400, Denver Braughler
<[EMAIL PROTECTED]> wrote:
does anybody know how to strip out html tags from a text using regexp
If Caché has regular expressions, that is good news to me.
(or anything else)
The DOM parser (Robb Tweed?) might help you depending on your purpose.
I use an edit utility classmethod that replaces from string1 to
string2 with string3 for every occurrence.
I strip everything from < to >, and replace specific &...;
occurrences with a single character.
You could start by stripping everything from <script> to </script>.
What the PHP code does is less general.
Where I have had numerous such edits, I have listed them in $text()
and iterated through them at runtime.
---
Rob Tweed
M/Gateway Developments Ltd
Global DOMination with eXtc : http://www.mgateway.com
---
--------------090501010904050803000208
Content-Type: text/xml;
name="StringTools.xml"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
filename="StringTools.xml"
<?xml version="1.0" encoding="UTF-8"?>
<Export generator="Cache" version="9" zv="Cache for Windows NT (Intel/P4) 5.0.7 (Build 5000U)"
ts="2004-07-09 15:00:54">
<Class name="String.Tools">
<Abstract>1</Abstract>
<IncludeCode>%stringreplace</IncludeCode>
<ProcedureBlock>1</ProcedureBlock>
<TimeChanged>59701,18360.409814</TimeChanged>
<Method name="StripHTML">
<Description>
Removes HTML markup from a whole or partial string (for example one chunk of a
stream), where the chunk may end in the middle of a markup tag.
str is edited in place. Resume is in/out: pass 1 when the previous chunk ended
inside a tag; on return it is 1 when this chunk ended inside a tag, otherwise 0.
The same flag is also the return value (note: it is not a real %Status code).</Description>
<ClassMethod>1</ClassMethod>
<FormalSpec><![CDATA[&str:%String,&Resume:%Boolean=0]]></FormalSpec>
<ReturnType>%Status</ReturnType>
<Implementation><![CDATA[
    set tagOpen="<",tagClose=">"
    ; When resuming inside a tag left open by the previous chunk the cut starts
    ; at position 1; otherwise it starts just after the first "<" (0 = none).
    if Resume {
        set from=1
    } else {
        set from=$find(str,tagOpen)
    }
    set Resume=0
    while from>0 {
        set to=$find(str,tagClose,from)
        if to=0 {
            ; No closing ">" in this chunk: cut through the end of the string
            ; and tell the caller that the tag continues in the next chunk.
            set to=$length(str)+1
            set Resume=1
        }
        ; $find returns the position AFTER the match, hence the -1 on both ends.
        set $extract(str,from-1,to-1)=""
        set from=$find(str,tagOpen)
    }
    quit Resume
]]></Implementation>
</Method>
<Method name="HTMLToText">
<Description><![CDATA[
Method to strip <Markup>(HTML/XML) from an input string leaving the text values suitable for
plain/text viewing or file storage.<br>
This method also attempts to convert special escape sequences to their native text
character.<br>
The stream argument may be a stream object (read in chunks after a Rewind) or a plain
string. The converted text is written with Print; nothing is returned.<br>
If Compress is greater than 0 the text is passed through Compress(); otherwise it is
re-wrapped with SetLineBoundary().<br>
StartElem and EndElem parameter values (if specified) are exclusive (only data between are
parsed).<br>
The CompareOp parameter (vbBinaryCompare =0, vbTextCompare =1) is only used when evaluating the
StartElem and EndElem string searches.]]></Description>
<ClassMethod>1</ClassMethod>
<FormalSpec><![CDATA[&stream:%Stream="",Compress:%Integer=0,StartElem:%String="",EndElem:%String="",CompareOp:%Integer=0]]></FormalSpec>
<Language>basic</Language>
<Implementation><![CDATA[
    Const MAX_LINE_LENGTH = 75
    Dim arysplit, i, gt, ct, pBuff, ReadBytes
    Dim stylestart, styleend
    Dim baseFilter, dblSpace

    ReadBytes = 5000            ' characters requested from the stream per pass
    dblSpace = String(2,32)     ' two spaces, built from the char code so the literal cannot be collapsed

    baseFilter = Chr(0)
    For i = 1 to 31             'Non-printable character filter (CR and LF are kept)
        if i <> 13 and i <> 10 then baseFilter = baseFilter & Chr(i)
    Next

    'Markup states
    Dim inTR as Boolean
    Dim inTD as Boolean
    Dim inTABLE as Boolean
    Dim inSCRIPT as Boolean
    Dim inCOMMENT as Boolean
    Dim inBODY as Boolean
    Dim inSTYLE as Boolean
    Dim inTITLE as Boolean
    Dim inHEAD as Boolean
    Dim inOL as Boolean
    Dim inUL as Boolean
    Dim inPROC as Boolean
    Dim SUPRESS as Boolean
    Dim PlaceInList as Integer
    Dim cval as Integer
    Dim chval as String

    ' Every state flag starts out False so no test below reads an unset variable.
    SUPRESS = False
    inTR = False
    inTD = False
    inTABLE = False
    inSCRIPT = False
    inCOMMENT = False
    inBODY = False
    inSTYLE = False
    inTITLE = False
    inHEAD = False
    inOL = False
    inUL = False
    PlaceInList = 0
    pBuff = ""

    If IsObject(stream)=1 then
        stream.Rewind()
        if stream.AtEnd then Return
        bytes = ReadBytes - Len(pBuff)
        pBuff = ""
        strtext = pBuff & stream.Read(bytes)
    else
        strtext = stream
    end if

    Do While Len(strtext)>0

        ' Keep only the text after StartElem / before EndElem when they are given.
        if Len(StartElem) > 0 Then
            i = Instr(1,lcase(strtext),lcase(StartElem),CompareOp)
            If i > 0 Then strtext=Mid(strtext,i+Len(StartElem))
        End If
        if Len(EndElem) > 0 Then
            i = Instr(1,lcase(strtext),lcase(EndElem),CompareOp)
            If i > 0 Then strtext=Left(strtext,i-1)
        End If

        ';Handle Blocked <style ...></style> tags
        ' Works backwards from the end, cutting each <style ...>...</style> pair.
        ' A </style> with no opening tag in this chunk drops everything before it.
        styleend=-1
        Do
            styleend = InstrRev(LCase(strtext),"</style>",styleend,1)
            if styleend > 0 then
                stylestart = InstrRev(LCase(strtext),"<style",styleend,1)
                if stylestart > 0 then
                    strtext = Left(strtext,stylestart-1) & Mid(strtext,styleend+8)
                    styleend=stylestart
                else
                    strtext = Mid(strtext,styleend+8)
                    exit do
                end if
            else
                exit do
            End If
        Loop

        ct=Len(strtext,"<")
        if ct > 0 then
            ' Each element is "tagname attrs>text that follows the tag".
            ' NOTE(review): element 0 (text before the first "<") is handled like a
            ' tag remainder, which is what lets a tag split across two chunks work.
            arysplit = Split(pBuff & strtext, "<",-1) 'Prepend prev block remainder
            For i = 0 To ct-1
                gt=InStr(arysplit(i), ">")
                tagBuff= lcase(left(arysplit(i),10))

                ' Inside a script block everything is discarded, EXCEPT the closing
                ' tag itself: it must reach the "/script" branch below, otherwise
                ' inSCRIPT is never cleared and the rest of the document is lost.
                If inSCRIPT And Left(tagBuff,7) <> "/script" Then gt=0

                If gt > 0 Then
                    ' singleELEM: the tag closes itself ("/>", "-->" or "]>").
                    if Left(tagBuff,3)="!--" Or Left(tagBuff,2)="![" Then
                        singleELEM = (Mid(arysplit(i),gt-2,2) = "--") or (Mid(arysplit(i),gt-1,1) = "]")
                    else
                        singleELEM = (Mid(arysplit(i),gt-1,1) = "/")
                    end if

                    ' Keep only the text after the tag, minus control characters.
                    arysplit(i) = Me.Filter(Trim(Mid(arysplit(i), gt + 1 )), baseFilter)
                    if inTABLE then arysplit(i)= Replace(arysplit(i), vbCRLF,"`" )
                    arysplit(i) = Replace( arysplit(i), vbCrLf,"`")
                    arysplit(i) = Replace( arysplit(i), "`" ," ")

                    ' Collapse runs of spaces (including those that replaced line
                    ' breaks above) down to a single space.
                    Do While InStr(arysplit(i), dblSpace) > 0
                        arysplit(i) = Replace(arysplit(i), dblSpace, " ")
                    Loop
                else
                    ' No ">" yet: remember the fragment so it can be completed by
                    ' the next chunk, and emit nothing for it now.
                    pBuff= arysplit(i)
                    arysplit(i)=""
                    tagBuff=""
                end if

                ' Tag dispatch. "head", "/head" and "hr" are tested BEFORE the
                ' generic one-letter "h" / "/h" heading tests, which would
                ' otherwise swallow them.
                if Left(tagBuff,4)="body" then
                    inBODY = Case( singleELEM,true:false,:true)
                elseif Left(tagBuff,5)="/body" then
                    inBODY = false
                elseif Left(tagBuff,4)="head" then
                    inHEAD = Case( singleELEM,true:false,:true)
                elseif Left(tagBuff,5)="/head" then
                    inHEAD = False
                elseif Left(tagBuff,2)="hr" then
                    ' 70 dashes (character code 45) as a horizontal rule.
                    if Compress <= 0 then arysplit(i)= vbCrLf & String(70,45) & vbCrLf & arysplit(i)
                elseif Left(tagBuff,1)="p" then
                    if Not inTD then arysplit(i)= vbCrLf & arysplit(i)
                elseif Left(tagBuff,2)="/p" then
                    arysplit(i)= arysplit(i) & vbCrLf
                elseif Left(tagBuff,1)="h" then
                    arysplit(i)= vbCrLf & arysplit(i)
                elseif Left(tagBuff,3)="img" then
                    arysplit(i)= "[Image Excluded] " & arysplit(i)
                elseif Left(tagBuff,2)="/h" then
                    arysplit(i)= vbCrLf & arysplit(i)
                elseif Left(tagBuff,2)="tr" then
                    inTR = Case( singleELEM,true:false,:true)
                elseif Left(tagBuff,3)="/tr" then
                    arysplit(i)= vbCRLF & arysplit(i)
                    inTR = False
                elseif Left(tagBuff,5)="table" then
                    inTABLE = Case( singleELEM,true:false,:true)
                elseif Left(tagBuff,6)="/table" then
                    inTABLE = False
                elseif Left(tagBuff,2)="td" or Left(tagBuff,2)="th" then
                    inTD = Case( singleELEM,true:false,:true)
                elseif Left(tagBuff,3)="/td" or Left(tagBuff,3)="/th" then
                    ' Table cells are separated by a tab.
                    if inTD then arysplit(i)= Me.Filter(arysplit(i), vbCRLF) & vbTab
                    inTD = False
                elseif Left(tagBuff,6)="script" and Not inSCRIPT then
                    inSCRIPT = Case( singleELEM,true:false,:true)
                elseif Left(tagBuff,7)="/script" then
                    inSCRIPT = False
                elseif Left(tagBuff,5)="style" then
                    inSTYLE = Case( singleELEM,true:false,:true)
                elseif Left(tagBuff,6)="/style" then
                    inSTYLE = False
                elseif Left(tagBuff,3)="!--" then
                    ' NOTE(review): nothing ever sets inCOMMENT back to False when a
                    ' comment spans several elements; only entity decoding is affected.
                    inCOMMENT = Case( singleELEM,true:false,:true)
                elseif Left(tagBuff,5)="title" then
                    inTITLE = Case( singleELEM,true:false,:true)
                elseif Left(tagBuff,6)="/title" then
                    inTITLE = False
                elseif Left(tagBuff,2)="br" then
                    arysplit(i)= vbCrLf & arysplit(i)
                elseif Left(tagBuff,2)="ol" then
                    inOL = Case( singleELEM,true:false,:true)
                    arysplit(i)= vbCrLf & vbCrLf & arysplit(i)
                    PlaceInList=0
                elseif Left(tagBuff,3)="/ol" then
                    inOL = False
                    PlaceInList=0
                elseif Left(tagBuff,2)="ul" then
                    inUL = Case( singleELEM,true:false,:true)
                    arysplit(i)= vbCrLf & vbCrLf & arysplit(i)
                elseif Left(tagBuff,3)="/ul" then
                    inUL = False
                elseif Left(tagBuff,2)="li" then
                    ' Numbered inside <ol>, bullet character (code 149) otherwise.
                    If inOL then PlaceInList = PlaceInList + 1
                    arysplit(i)= vbCrLf & Case(inOL, true: PlaceInList & ". ",: Chr(149) & " ") & arysplit(i)
                elseif Left(tagBuff,3)="/li" then
                    arysplit(i)= vbCrLf & arysplit(i)
                End if

                ' Decode numeric character references (&#nnn;) in body text.
                ' Codes above 255 are replaced by character 151.
                if inBODY and Not inSCRIPT and Not inCOMMENT and Not inSTYLE then
                    Do
                        tesc1 = instr(arysplit(i),"&#")
                        if tesc1 > 0 then
                            tesc2 = instr(tesc1,arysplit(i),";")
                            if tesc2 > tesc1 then
                                cval = +Mid(arysplit(i),tesc1+2,tesc2-tesc1-2)
                                if Abs(cval) <= 255 Then
                                    chval = Chr(cval)
                                else
                                    chval = Chr(151)
                                end if
                                arysplit(i)=Left(arysplit(i),tesc1-1) & chval & Mid(arysplit(i),tesc2+1)
                            else
                                exit do
                            end if
                        else
                            exit do
                        end if
                    Loop
                End If
            Next
        End If

        strtext = Join(arysplit)
        EraseArray arysplit

        If Compress > 0 then
            strtext=Me.Compress(strtext)
        else
            strtext=Me.SetLineBoundary(strtext,MAX_LINE_LENGTH,vbCrLf) & vbCrLf
        End if

        Print "%CSP.Page".UnescapeHTML(strtext)
        strtext=""

        ' Fetch the next chunk, prefixed with any tag fragment left over from this one.
        If IsObject(stream)=1 then
            if stream.AtEnd then Exit Do
            bytes = ReadBytes - Len(pBuff)
            ' A carried-over fragment as large as the chunk size would otherwise
            ' request 0 characters and the loop would never advance.
            If bytes < 1 Then bytes = ReadBytes
            strtext=pBuff & stream.Read( bytes)
            pBuff=""
        End If
    Loop
    Return
]]></Implementation>
</Method>
<Method name="SetLineBoundary">
<Description>
Re-wraps text so that no output line is longer than Length characters, breaking
only at spaces. strIn is split into lines on Delim (CR+LF when Delim is empty or
not supplied); each line is stripped of leading/trailing whitespace and, if it
is still too long, broken into several lines. Every output line is terminated
with the delimiter. A single word longer than Length is emitted on a line of its
own without being split.
NOTE(review): the default of 80 for Length is a reconstruction - the exported
default value was damaged in transit. Confirm against the original class.</Description>
<ClassMethod>1</ClassMethod>
<FormalSpec>strIn:%String,Length:%Integer=80,Delim:%String</FormalSpec>
<Private>1</Private>
<ReturnType>%String</ReturnType>
<Implementation><![CDATA[
    quit:strIn="" strIn
    ; Honour the caller's delimiter; fall back to CR+LF when none is given.
    set sep=$select($get(Delim)'="":Delim,1:$char(13,10))
    set out=""
    for i=1:1:$length(strIn,sep) {
        set line=$zstrip($piece(strIn,sep,i),"<>W")
        if $length(line)'>Length {
            ; Short enough already: copy through unchanged.
            set out=out_line_sep
        } else {
            ; Greedy fill: add words to the current line while they still fit.
            set cur=""
            for w=1:1:$length(line," ") {
                set word=$piece(line," ",w)
                if cur="" {
                    ; First word of an output line is always accepted, even when
                    ; it is longer than Length (no blank line is emitted for it).
                    set cur=word
                } elseif ($length(cur)+1+$length(word))>Length {
                    set out=out_cur_sep,cur=word
                } else {
                    set cur=cur_" "_word
                }
            }
            set out=out_cur_sep
        }
    }
    quit out
]]></Implementation>
</Method>
<Method name="QPDecode">
<Description>
Decode a Quoted-Printable String.
input is passed by reference and is replaced with the decoded text.
Returns 1 on success, or 0 if an error was trapped while decoding (input is then
left unchanged).
Decoding rules: "=XX" (two hex digits) becomes the character with that code;
"=" followed by a line break (CR+LF or LF) is a soft line break and is removed;
any other "=" is kept literally. Every "_" is turned into a space first
(presumably for header-style "Q" encoded text - confirm with callers).</Description>
<ClassMethod>1</ClassMethod>
<FormalSpec><![CDATA[&input:%String]]></FormalSpec>
<ReturnType>%String</ReturnType>
<Implementation><![CDATA[
    set $ztrap="dce"
    set in=$translate(input,"_",$char(32))
    ; Everything before the first "=" is literal text.
    set text=$piece(in,"=")
    for k=2:1:$length(in,"=") {
        ; p is the text that followed one "=" sign.
        set p=$piece(in,"=",k)
        set h=$extract(p,1,2)
        if ($length(h)=2)&&($translate(h,"0123456789abcdefABCDEF")="") {
            ; The trailing "H" forces $ZHEX to read h as hexadecimal even when
            ; it consists of decimal digits only (e.g. "20").
            set text=text_$char($zhex(h_"H"))_$extract(p,3,$length(p))
        } elseif h=$char(13,10) {
            ; Soft line break "=" CR LF: drop it and join the lines.
            set text=text_$extract(p,3,$length(p))
        } elseif $extract(p)=$char(10) {
            ; Soft line break with a bare LF.
            set text=text_$extract(p,2,$length(p))
        } else {
            ; Not an escape sequence: keep the "=" as it was.
            set text=text_"="_p
        }
    }
    set input=text
    quit 1
dce
    set $ztrap=""
    quit 0
]]></Implementation>
</Method>
<Method name="Replace">
<Description>
Returns a copy of ins in which every occurrence of find has been replaced by
with. Thin class-method wrapper around the Basic Replace function.</Description>
<ClassMethod>1</ClassMethod>
<FormalSpec>ins:%String="",find:%String="",with:%String=""</FormalSpec>
<Language>basic</Language>
<ReturnType>%String</ReturnType>
<Implementation><![CDATA[
    ' Delegate straight to the built-in; no intermediate variable is needed.
    Return Replace(ins, find, with)
]]></Implementation>
</Method>
<Method name="Compress">
<Description>
Compacts text for single-line display: deletes every CR, LF, TAB, back-quote and
non-breaking space (character 160), then collapses each run of spaces to a single
space. The mode argument is accepted but not used.</Description>
<ClassMethod>1</ClassMethod>
<FormalSpec>str:%String,mode:%Integer=0</FormalSpec>
<Language>basic</Language>
<ReturnType>%String</ReturnType>
<Implementation><![CDATA[
    Dim dblSpace
    ' Two spaces, built from the character code so the literal cannot be
    ' collapsed to one space by an editor or mail gateway.
    dblSpace = String(2,32)

    ' Delete the unwanted characters first; removing them can bring two spaces
    ' together, so the space clean-up has to come afterwards.
    str = Me.Filter(str, vbCrLf & vbTab & "`" & Chr(160))

    ' Remove doubled spaces: repeat until only single spaces remain.
    Do While InStr(str, dblSpace) > 0
        str = Replace(str, dblSpace, " ")
    Loop
    Return str
]]></Implementation>
</Method>
<Method name="Filter">
<Description>
Character filter built on $TRANSLATE: each character of strtext that appears in
filter is replaced by the character at the same position in replwith, or deleted
when replwith has no character at that position. With the default empty replwith
every character listed in filter is removed.</Description>
<ClassMethod>1</ClassMethod>
<FormalSpec>strtext:%String,filter:%String,replwith:%String=""</FormalSpec>
<Language>cache</Language>
<ReturnType>%String</ReturnType>
<Implementation><![CDATA[
    ; The mapping is per character, not per substring.
    set filtered=$translate(strtext,filter,replwith)
    quit filtered
]]></Implementation>
</Method>
</Class>
<Checksum value="3725873425"/>
</Export>
--------------090501010904050803000208--