/// <summary>
/// Converts an HTML fragment or document to plain text.
/// </summary>
/// <remarks>
/// Processing order: comments are removed; title, script and style elements
/// are removed together with their contents; every remaining tag is stripped;
/// character references are decoded; each run of whitespace (line breaks and
/// non-breaking spaces included) is collapsed to a single space and the result
/// is trimmed; finally typographic quotes are mapped to their ASCII
/// equivalents. This is a regex-based best effort rather than a real HTML
/// parser: a literal "&lt;" followed later by "&gt;" in text content is
/// treated as a tag and removed.
/// </remarks>
/// <param name="html">The markup to clean. May be null or empty.</param>
/// <returns>
/// The plain-text content. Null or empty input is returned unchanged.
/// </returns>
public static string RemoveHTML(string html)
{
    // Nothing to clean; also keeps the "null in, null out" contract without
    // relying on a swallowed exception.
    if (string.IsNullOrEmpty(html))
    {
        return html;
    }

    // Singleline lets '.' cross line breaks so multi-line spans are matched.
    const RegexOptions spanOptions = RegexOptions.IgnoreCase | RegexOptions.Singleline;

    // Comments go first: their bodies may contain '>' (or whole blocks of
    // markup, as in conditional comments) that would confuse the tag pass.
    string text = Regex.Replace(html, @"<!--.*?-->", string.Empty, spanOptions);

    // These elements carry no visible body text, so drop them with their contents.
    text = Regex.Replace(text, @"<(title|script|style)\b[^>]*>.*?</\1\s*>", string.Empty, spanOptions);

    // Strip every remaining tag. [^>] also matches line breaks, so tags whose
    // attributes are wrapped over several lines are removed too.
    text = Regex.Replace(text, @"<[^>]*>", string.Empty);

    // Decode only after the tags are gone, so an encoded "&lt;b&gt;" ends up
    // as the literal text "<b>" instead of being stripped as markup.
    text = System.Net.WebUtility.HtmlDecode(text);

    // \s covers CR, LF, tabs and U+00A0 (what &nbsp; decodes to).
    text = Regex.Replace(text, @"\s+", " ").Trim();

    // Typographic quotes (literal or decoded from entities) become ASCII quotes.
    return text
        .Replace('\u2018', '\'')
        .Replace('\u2019', '\'')
        .Replace('\u201C', '"')
        .Replace('\u201D', '"');
}
Comments
Post a Comment