HTML To TXT (Part 2)

by 曾经沧海
499 阅读

// Convert a single HTML token to its plain-text equivalent.
//   - a character entity (&xxx;) is replaced by the character it names;
//     entities not listed below are dropped
//   - a tag (<xxx>) is removed, or replaced by the line breaks / markers
//     that approximate its layout effect
//   - any other text is returned unchanged inside <pre>..</pre>, and with
//     every CR sequence stripped outside of it
// s     : the token to convert
// inPre : in/out flag, true while the caller is between <pre> and </pre>;
//         set on <pre...> and cleared on </pre>
// Reads MarkLinks and reads/writes CurrLink from the enclosing scope.
  function ConvertHTMLToken(const s:string;var inPre:boolean):string;
  var
    s0,s0_2,s0_3,s0_4:string;
  begin
    result:='';
    if s='' then
      exit;

    if s[1]='&' then
    begin
      s0:=lowerCase(s);
      // The tokenizer is not visible here, so accept the entity both with
      // and without its terminating ';' (e.g. '&amp;' and '&amp').
      if s0[length(s0)]=';' then
        delete(s0,length(s0),1);

      if s0='&nbsp' then result:=' '
      else if s0='&quot' then result:='"'
      else if s0='&gt' then result:='>'
      else if s0='&lt' then result:='<'
      else if s0='&middot' then result:='·'
      else if s0='&trade' then result:=' TM '
      else if s0='&copy' then result:='(c)'
      else if s0='&reg' then result:='(R)'
      else if s0='&amp' then result:='&';
    end
    else if s[1]='<' then
    begin
      s0:=lowerCase(s);
      s0_2:=copy(s0,1,2);
      s0_3:=copy(s0,1,3);
      s0_4:=copy(s0,1,4);

      // Prefix match so that <br>, <br/>, <br /> and <br clear=all> all
      // produce a line break, like the other prefix-matched tags below.
      if s0_3='<br' then result:=CR
      else if s0_4='<pre' then   // <pre must be tested before <p
           begin inPre:=true;result:=CR; end
      else if s0_2='<p' then result:=CR+CR
      // <hr> becomes a rule of 40 dashes on its own line
      else if s0_3='<hr' then result:=CR+MakeStr('-',40)+CR
      else if s0_3='<ol' then result:=CR
      else if s0_3='<ul' then result:=CR
      else if s0_3='<li' then result:='·'
      else if s0_4='</li' then result:=CR
      else if s0_4='</tr' then result:=CR
      else if s0='</td>' then result:=#9
      else if s0='<title>' then result:='《'
      else if s0='</title>' then result:='》'+CR+CR
      else if s0='</pre>' then inPre:=false
      else if copy(s0,1,6)='<table' then result:=CR
      // Only a real anchor tag: '<a' followed by '>' or whitespace.
      // Testing s0[2]='a' alone also matched <address>, <area>, <applet>
      // and read past the end of a one-character token.
      else if MarkLinks and (length(s0)>=3) and (s0[2]='a')
              and (pos(s0[3],' >'#9#13#10)>0) then
           begin
             CurrLink:=GetLink(s);
             if CurrLink<>'' then result:='[';
           end
      else if MarkLinks and (s0='</a>') then
             if CurrLink<>'' then result:=format(' %s ]',[CurrLink]);
    end
    else if inPre then
      result:=s
    else // outside <pre>..</pre>: remove every CR sequence
      result:=ReplaceStr(s,CR,'');
  end;

begin
  // Main loop: walk the input token by token, drop <style> and <script>
  // sections entirely, and append the plain-text form of every other token
  // to the result.
  s0:=UnixToDos(HTMLText);
  result:='';
  InputLen:=length(s0);
  InputIdx:=1;
  inPre:=false;
  CurrLink:='';

  while InputIdx<=InputLen do
  begin
    NextToken:=GetNextToken(s0,InputIdx);

    // Skip everything between <style ...> and </style>.
    if lowercase(copy(NextToken,1,6))='<style' then
    begin
      // Bounded by InputLen: without this guard a document that never
      // closes <style> would loop forever once the input is exhausted.
      while (InputIdx<=InputLen) and (lowercase(NextToken)<>'</style>') do
      begin
        inc(InputIdx,length(NextToken));
        NextToken:=GetNextToken(s0,InputIdx);
      end;
      // Unterminated <style>: the rest of the input was style content.
      if InputIdx>InputLen then
        break;
      inc(InputIdx,length(NextToken));
      NextToken:=GetNextToken(s0,InputIdx);
    end;

    // Skip everything between <script ...> and </script>.
    if lowercase(copy(NextToken,1,7))='<script' then
    begin
      inc(InputIdx,length(NextToken));
      inQuot:=false;
      i:=InputIdx-1;
      while I<InputLen do
      begin
        inc(i);
        // Track double-quoted strings so that a '<!--' inside a string
        // literal is not mistaken for a comment.
        if s0[i]='"' then
        begin
          inQuot:=not inQuot;
          continue;
        end;
        // Skip <!-- ... --> comment sections inside the script (99.8.2).
        if not inQuot then
          if copy(s0,i,4)='<!--' then
          begin
            HelpIdx:=pos('-->',copy(s0,i+4,MaxInt));
            if HelpIdx>0 then
            begin
              // 4 = length of '<!--', HelpIdx-1 = comment text before
              // '-->', 3 = length of '-->'; i lands just past the comment.
              inc(i,4+HelpIdx+2);
            end
            else
            begin
              // Unterminated comment: treat the rest of the input as script.
              i:=InputLen;
              break;
            end;
          end;
        // Checked regardless of the quote state, as in the original code.
        if lowercase(copy(s0,i,9))='</script>' then
        begin
          break;
        end;
      end;
      // i is on the '<' of </script> (or at the end of the input); the
      // closing tag is then consumed as an ordinary token below.
      InputIdx:=i;
    end;

    NextToken:=GetNextToken(s0,InputIdx);
    inc(InputIdx,length(NextToken));
    result:=result+ConvertHTMLToken(NextToken,inPre);
  end;
end;

发表评论