Here's the HTML stripper originally written by Kevin Picone in PlayBASIC, converted into a DBPro-friendly format.
It also requires the IanM MatrixUtils plugin.
I'm just posting it here in case anybody wants it later.
Rem Project: TEST HTML Stripper
Rem Created: Wednesday, December 21, 2016
Rem ***** Main Source File *****
` ----------------------------------------------------------------------------
` ----------------------------------------------------------------------------
` --{ STRIP HTML FROM STRING }-------------------------------------------
` ----------------------------------------------------------------------------
` ----------------------------------------------------------------------------
REMSTART
This function scans the input string and strips anything that looks like
an HTML tag. The code only supports a few paired tags, so if you need more,
add them to the select statement in the middle.
REMEND
` ----------------------------------------------------------------------------
` Demo driver: load webpage.txt into one string, strip the HTML from it and
` print the resulting plain text every frame.
` ----------------------------------------------------------------------------
Html$ = ""
HtmlNew$ = ""

` Only open the file when it is actually there; a missing file simply leaves
` Html$ empty instead of stopping the program with a runtime error.
if file exist("webpage.txt") = 1
   open to read 1, "webpage.txt"
   WHILE FILE END(1) = 0
      read string 1, HtmlNew$
      ` READ STRING hands back the line without its line break, so put a
      ` single space between lines; otherwise the last word of one line and
      ` the first word of the next are fused together.
      if Html$ <> "" then Html$ = Html$ + " "
      Html$ = Html$ + HtmlNew$
   ENDWHILE
   ` Release file channel 1 now that the whole document is in memory.
   close file 1
endif

CleanText$=Strip_Html_From_String(Html$)

` Display setup: manual sync at 60 fps, desktop resolution, black backdrop.
sync on: sync rate 60
desktopwidth=desktop width()
desktopheight=desktop height()
set display mode desktopwidth, desktopheight,32,1
Set window position -2,-18
backdrop on
color backdrop 0
` NOTE(review): with the escape key disabled this loop has no exit path of its
` own - confirm that is intended.
disable escapekey
`#######################################################################################################################################################################################
`#######################################################################################################################################################################################
`#######################################################################################################################################################################################
DO
   ` Redraw the stripped text from the top-left corner each frame.
   set cursor 0, 0
   PRINT CleanText$
   SYNC
LOOP
`#######################################################################################################################################################################################
`#######################################################################################################################################################################################
`#######################################################################################################################################################################################
// ----------------------------------------------------------------------------
// Strip_Html_From_String
//
// Walks Html$ one character at a time and copies everything that is not part
// of a tag into the result.
//
//   * "<? ... ?>" blocks and "<!-- ... -->" comments are skipped completely.
//   * For IMG tags the value of the alt property is written to the output.
//   * For SCRIPT and STYLE tags everything up to the matching closing tag is
//     discarded as well.
//   * A "<" that does not start something tag-like is copied through as text.
//
// Afterwards a few common entities are decoded, line breaks become spaces and
// runs of spaces/tabs are collapsed by Single_Space_String.
//
// Html$   : the raw HTML text
// Returns : the text with the tags removed
// ----------------------------------------------------------------------------
Function Strip_Html_From_String(Html$)
   HtmlSize=Len(Html$)
   TextOutput$=""
   for lp=1 to HtmlSize
      ThisChr=mid ASCII(Html$,lp)
      if Thischr=asc("<")
         // are we more than 2 chrs from end ?
         if Lp+2 <= HtmlSize
            NextChr=mid ASCII(Html$,lp+1)
            if NextChr=asc("?")
               // DETECT "<? ... ?>" BLOCK, if so, find closing and skip completely
               closetag=INSTR(html$,"?>",lp)
               if closeTag>lp
                  // land on the ">" of "?>"; the NEXT below steps past it.
                  Lp=closetag+1
                  // stands in for the original "continue": without it the code
                  // fell through and emitted a stray "<" for every skipped block
                  goto StripNextChr
               endif
            endif
            // check if Next char is the closing tag (assuming it's tight against the less than chr
            if NextChr=asc("/")
               NextChr=mid ASCII(Html$,lp+2)
            endif
            // ----------------------------------
            // Is the next character a DOCTYPE ?
            // ----------------------------------
            if NextChr=asc("!")
               NextChr=mid ASCII(Html$,lp+2)
               //this might be a comment
               if mid ASCII(Html$,lp+2)=asc("-")
                  if mid ASCII(Html$,lp+3)=asc("-")
                     // DETECT COMMENT TAG, if so, find closing and skip completely
                     closetag=INSTR(html$,"-->",lp)
                     if closeTag>lp
                        // land on the ">" of "-->"; the NEXT below steps past it.
                        Lp=closetag+2
                        // see the "<?" branch: skip straight to the next character
                        goto StripNextChr
                     endif
                  endif
               endif
            endif
            if (NextChr=>asc("a") and NextChr<=asc("z")) or (NextChr=>asc("A") and NextChr<=asc("Z"))
               // look for the ">" that ends this tag
               CloseTag=INSTR(Html$,">",lp+1)
               if CloseTag>lp
                  WhiteSpaceFound= 0
                  //find the first white character after the "<", if there is one
                  for SearchLP=lp+1 to CloseTag
                     FindChr=mid ASCII(html$,Searchlp)
                     if Findchr=32 or findchr=9
                        WhiteSpaceFound =SearchLP
                        // force the search loop to finish (stands in for exitfor)
                        SearchLP = CloseTag
                     endif
                  next
                  if WhiteSpaceFound>0
                     // tag name runs up to the first space/tab (trimmed below)
                     Tag$=MID$(Html$,lp+1,WhiteSpaceFound-lp)
                  else
                     // no white space, the name is everything between < and >
                     Tag$=MID$(Html$,lp+1,CloseTag-(lp+1))
                  endif
                  // --------------------------------------------------------
                  // TRAP TAGS and parse out any properties you might want
                  // --------------------------------------------------------
                  FindClosingTag=0
                  tag$=trim$(Tag$)
                  select upper$(tag$)
                     // --------------------------------------------------------
                     case "IMG"
                        // --------------------------------------------------------
                        // grab the complete tag,
                        FullTag$=MID$(Html$,lp+1,CloseTag-(lp+1))
                        // pull out alternate text if you need it here
                        AltString$=GetProperty(FullTag$,"alt")
                        TextOutput$ =TextOutput$ + " "+AltString$+" "
                        // Filename$=GetProperty(FullTag$,"src")
                        // filename$=getfilename$(Filename$)
                        // TextOutput$+=" "+filename$+" "
                     ENDCASE
                     // --------------------------------------------------------
                     case "SCRIPT","STYLE"
                        // --------------------------------------------------------
                        // Handle PAIRED TAGS, so we're assuming everything between
                        // this tag and its closing statement is junk to be removed
                        FindClosingTag= 1
                     ENDCASE
                  EndSelect
                  // resume after the ">" of this tag
                  lp=closeTag
                  if FindClosingTag = 1
                     // assuming closing tag is in the same form it might be in < /tag>
                     endtag$="<"+"/"+tag$+">"
                     endtagpos=INSTR(html$,Endtag$,closetag+1)
                     if EndTagPos>CloseTag
                        // land on the ">" of the closing tag
                        lp=EndTagPos+len(endtag$)-1
                     endif
                  endif
               else
                  // no closing > found so just output this as a char
                  goto OutputCHR
               endif
            else
               // this seems to be a stand alone < char and not a tag
               goto OutputCHR
            endif
         else
            // too close to the end of the string to be a tag
            goto OutputCHR
         endif
      else
         OutputCHR:
         // drop this character to the output string
         TextOutput$=TextOutput$ + Chr$(ThisChr)
         // end of < check
      endif
      // "continue" target for the skip branches above. The label name is kept
      // unique to this function.
      StripNextChr:
   next
   // brute force replace common character set encodings
   TextOutput$=REPLACE ALL$(TextOutput$,"&"+"nbsp;"," ")
   TextOutput$=REPLACE ALL$(TextOutput$,"&"+"lt;","<")
   TextOutput$=REPLACE ALL$(TextOutput$,"&"+"gt;",">")
   // decode the ampersand last so text such as "&amp;lt;" is not decoded twice
   TextOutput$=REPLACE ALL$(TextOutput$,"&"+"amp;","&")
   // a line break separates words, so turn it into a space rather than
   // deleting it; the extra spaces are collapsed on the next line
   TextOutput$=REPLACE ALL$(TextOutput$,chr$(13)+chr$(10)," ")
   TextOutput$=REPLACE ALL$(TextOutput$,chr$(10)," ")
   // squeeze runs of spaces/tabs down to a single space
   Textoutput$=Single_Space_String(TextOutput$)
EndFunction TextOutput$
`#######################################################################################################################################################################################
` ----------------------------------------------------------------------------
` GetProperty
`
` Returns the double-quoted value of a property inside a tag, e.g. the text
` between the quotes of  alt="..."  .
`
` Tag$      : the text of the tag to search
` Property$ : the property name, without the = sign or quotes
` Returns   : the text between the quotes, or "" when the property (or its
`             closing quote) is not found
`
` NOTE(review): the search looks for Property$ exactly as written, followed by
` ="  - whether INSTR ignores case is not visible here; confirm if tags may
` use upper-case property names.
` ----------------------------------------------------------------------------
FUNCTION GetProperty(Tag$,Property$)
   Result$=""
   StartTag$=Property$+"="+chr$(34)
   StartPos=INSTR(Tag$,StartTag$)
   if StartPos>0
      ` Step over  name="  so StartPos sits on the first character of the
      ` value. (Previously the found position was overwritten with the length
      ` of StartTag$, so the wrong part of the tag was returned.)
      StartPos=StartPos+len(StartTag$)
      ` The closing quote is the next quote at or after the value start.
      EndPos=INSTR(Tag$,chr$(34),StartPos)
      if EndPos>StartPos
         Result$=MID$(Tag$,StartPos,EndPos-StartPos)
      endif
   endif
ENDFUNCTION Result$
`#######################################################################################################################################################################################
` ----------------------------------------------------------------------------
` Single_Space_String
`
` Copies S$ while squeezing every run of spaces (32) and tabs (9) down to one
` space character. A lone tab is also written out as a space. All other
` characters are copied unchanged.
`
` S$      : the text to tidy up
` Returns : the text with white space runs collapsed
` ----------------------------------------------------------------------------
FUNCTION Single_Space_String(S$)
   Collapsed$=""
   Total=Len(S$)
   Index=1
   while Index<=Total
      Code=mid ASCII(S$,Index)
      if Code<>32 and Code<>9
         ` ordinary character, copy it across as is
         Collapsed$=Collapsed$+chr$(Code)
      else
         ` Move Index forward to the last blank of this run by peeking one
         ` character ahead each time.
         Ahead=mid ASCII(S$,Index+1)
         while Ahead=32 or Ahead=9
            inc Index
            Ahead=mid ASCII(S$,Index+1)
         endwhile
         ` the whole run is represented by one space
         Collapsed$=Collapsed$+chr$(32)
      endif
      inc Index
   endwhile
ENDFUNCTION Collapsed$
`#######################################################################################################################################################################################
To strip a different HTML file, replace "webpage.txt" in the line open to read 1, "webpage.txt" with the name of your file, and put that file in your project folder.
You can try it with this file: