Class | String |
In: |
lib/feedparser/textconverters.rb
lib/feedparser/text-output.rb |
Parent: | Object |
This class provides various converters
Returns true if the text contains escaped HTML (i.e. markup written with HTML entities). Used by String#text2html.
# File lib/feedparser/textconverters.rb, line 13
# Tells whether the text contains entity-escaped HTML markup.
#
# Looks for an escaped image tag, link tag, line break (<br>, <br/> or
# <br />) or paragraph tag, i.e. tags whose angle brackets are written as
# the &lt; and &gt; entities.
#
# Returns the offset of the first pattern that matches (truthy), or nil
# when none of them does.
def escaped_html?
  return (self =~ /&lt;img src=/) ||
         (self =~ /&lt;a href=/) ||
         (self =~ /&lt;br(\/| \/|)&gt;/) ||
         (self =~ /&lt;p&gt;/)
end
Convert an HTML text to plain text
# File lib/feedparser/text-output.rb, line 6
# Converts HTML text to plain text.
#
# A copy of the receiver is fed to FeedParser::HTML2TextParser and the
# parser's saved data is then tidied up. The receiver is not modified.
# Returns the resulting plain-text string.
def html2text
  # parse HTML
  parser = FeedParser::HTML2TextParser::new(true)
  parser.feed(self.clone)
  parser.close
  plain = parser.savedata

  # Clean-up passes, applied in this order.
  cleanups = [
    [/\A\s*/m, ''],       # leading whitespace
    [/\s*\Z/m, ''],       # trailing whitespace
    [/ *\n/m, "\n"],      # spaces before a newline
    [/\n */m, "\n"],      # spaces after a newline
    [/\n\n+/m, "\n\n"]    # runs of blank lines collapse to a single one
  ]
  cleanups.each do |pattern, replacement|
    plain.gsub!(pattern, replacement)
  end
  plain
end
Is this text HTML? Searches for common tags. Used by String#text2html.
# File lib/feedparser/textconverters.rb, line 8
# Heuristic test for HTML content: searches the text for a handful of
# common tags (paragraph, line break, closing anchor, image).
#
# The patterns are tried in order; the offset of the first one that
# matches is returned (truthy), or nil when none matches.
def html?
  tag_patterns = [
    /<p>/,
    /<\/p>/,
    /<br>/,
    /<br\s*(\/)?\s*>/,
    /<\/a>/,
    /<img.*>/
  ]
  tag_patterns.each do |pattern|
    offset = self =~ pattern
    return offset if offset
  end
  nil
end
Remove white space around the text
# File lib/feedparser/textconverters.rb, line 51
# Strips leading and trailing whitespace from the receiver, in place.
#
# Returns the receiver itself, whether or not anything was removed.
def rmWhiteSpace!
  # gsub! returns nil when nothing was substituted, so the two calls are
  # deliberately not chained and self is returned explicitly.
  gsub!(/\A\s+/, '')
  gsub!(/\s+\Z/, '')
  self
end
Convert text to HTML.
# File lib/feedparser/textconverters.rb, line 35
# Converts text to HTML, working on a copy of the receiver.
#
# * Text that html? recognises as HTML is returned unchanged.
# * Text that escaped_html? recognises as escaped HTML is returned
#   un-escaped (see unescape_html).
# * Anything else is wrapped in <p> paragraphs, split on blank lines, and
#   http/ftp/https URIs are turned into links.
def text2html
  html = self.clone
  return html if html.html?
  return html.unescape_html if html.escaped_html?

  # paragraphs: wrap the whole text, then break it at blank lines
  html.gsub!(/\A\s*(.*)\Z/m, '<p>\1</p>')
  html.gsub!(/\s*\n(\s*\n)+\s*/, "</p>\n<p>")

  # uris
  uri_pattern = /(#{URI::regexp(['http','ftp','https'])})/
  html.gsub!(uri_pattern, '<a href="\1">\1</a>')
  html
end
Convert a text in inputenc to a text in UTF-8. Must take care of wrong input locales, i.e. text that is declared as something other than UTF-8 but is in fact already UTF-8.
# File lib/feedparser/textconverters.rb, line 57
# Converts the text, declared to be in encoding +inputenc+, to UTF-8.
#
# inputenc:: name of the declared input encoding; compared
#            case-insensitively against 'utf-8'.
#
# Returns the receiver itself when the declared encoding is UTF-8, when
# the text turns out to be UTF-8 anyway, or when conversion fails.
# Otherwise returns a new string built by re-packing every byte as a
# Unicode code point.
# NOTE(review): the byte-wise fallback never looks at +inputenc+, so it
# presumably assumes a Latin-1-like input - confirm against callers.
def toUTF8(inputenc)
  return self if inputenc.downcase == 'utf-8'

  # It is said not to be UTF-8. Make sure it REALLY is not: a clean
  # decode/encode round trip means the text is already UTF-8.
  begin
    return self if self.unpack('U*').pack('U*') == self
  rescue
    # not decodable as UTF-8; fall through to the byte-wise conversion
  end

  begin
    self.unpack('C*').pack('U*')
  rescue
    self # failsafe solution, but a dirty one
  end
end
Un-escape HTML in the text. Used by String#text2html.
# File lib/feedparser/textconverters.rb, line 18
# Un-escapes HTML entities in the receiver, in place.
#
# Handles &lt; &gt; &apos; &quot; &amp; and the numeric forms &#39;,
# &#38; and &#038;. Returns the receiver.
def unescape_html
  # Keyed by entity, not by replacement character: several entities map
  # to the same character, and duplicate hash keys would silently drop
  # all but the last of them.
  entities = {
    '&lt;'   => '<',
    '&gt;'   => '>',
    '&apos;' => "'",
    '&quot;' => '"',
    '&amp;'  => '&',
    '&#39;'  => "'",
    '&#38;'  => '&',
    '&#038;' => '&'
  }
  # Single pass, so the output of one replacement is never un-escaped a
  # second time (e.g. "&amp;lt;" becomes "&lt;", not "<").
  gsub!(Regexp.union(entities.keys)) { |entity| entities[entity] }
  self
end