Reputation: 2690
How do I remove HTML tags from a string so that I can output clean text?
// Pre-Swift 3 Foundation naming. Replaces every match of the pattern `<[^>]+>`
// (a `<`, one or more non-`>` characters, then `>`) with the empty string.
let str = string.stringByReplacingOccurrencesOfString("<[^>]+>", withString: "", options: .RegularExpressionSearch, range: nil)
print(str)
Upvotes: 124
Views: 71979
Reputation: 6380
I was able to achieve mild success using XML event-based processing with XMLParser, which is available on all platforms with Foundation.
- HTML is not a regular language.
- HTML is not XML; although the two are very similar, you may need to clean up your HTML before trying to parse it as XML.
- <br> and <hr> will make the parsing fail, but <br /> and <hr /> will be parsed as a \n.
- It requires the NSObject protocol and event-based processing.
- XMLParser has not been updated in a long time, and thus lacks a lot of the newer Swift capabilities that we would like to have.
- There is also XMLDocument in Foundation, but it is only available on macOS.
For my own use case I made a class that enables me to use async/await and asynchronous processing.
Feel free to tweak for your own use case, maybe improving the cleaning process of the original HTML
string.
import Foundation
/// Extracts the character data of an HTML string by wrapping it in an `<xml>` root element
/// and running it through `XMLParser`, exposing the outcome through an `async throws` initializer.
final class Parser: NSObject, XMLParserDelegate {
/// Text returned by the initializer's continuation (collected characters, trimmed of
/// surrounding whitespace and newlines).
private(set) var result = ""
/// Called from `parserDidEndDocument`; resumes the continuation with the collected text.
private var finished: (() -> Void)?
/// Called from the delegate error callbacks; resumes the continuation by throwing.
private var fail: ((Error) -> Void)?
/// Accumulates every chunk delivered to `parser(_:foundCharacters:)`.
private var content = ""
/// Parses `html` and stores the extracted text in `result`.
/// - Parameter html: Markup to strip; it is lightly cleaned before being parsed as XML.
/// - Throws: The error reported by `XMLParser` when the cleaned markup cannot be parsed.
init(html: String) async throws {
super.init()
result = try await withUnsafeThrowingContinuation { [weak self] continuation in
// tweak here as needed
// Removes the DOCTYPE declaration and turns unclosed <br>/<hr> into newlines.
let clean = html
.replacingOccurrences(of: "<!DOCTYPE html>",
with: "",
options: .caseInsensitive)
.replacingOccurrences(of: "<br>",
with: "\n",
options: .caseInsensitive)
.replacingOccurrences(of: "<hr>",
with: "\n",
options: .caseInsensitive)
// Wrap the markup in a single <xml> root element before handing it to the parser.
let xml = XMLParser(data: .init(("<xml>" + clean + "</xml>").utf8))
self?.finished = { [weak self] in
// Detach the delegate and clear both callbacks so neither can run again.
xml.delegate = nil
self?.fail = nil
self?.finished = nil
// NOTE(review): if `self` were nil here, this returns without resuming the continuation.
guard let content = self?.content else { return }
continuation
.resume(returning: content
.trimmingCharacters(in:
.whitespacesAndNewlines))
}
self?.fail = { [weak self] in
// Same teardown as `finished`, then stop the parser and surface the error.
xml.delegate = nil
self?.fail = nil
self?.finished = nil
xml.abortParsing()
continuation
.resume(throwing: $0)
}
xml.delegate = self
// `fail` is already nil if a delegate error callback ran during `parse()`,
// in which case this optional call does nothing.
if !xml.parse(),
let error = xml.parserError {
self?.fail?(error)
}
}
}
/// Signals successful completion through `finished`.
func parserDidEndDocument(_: XMLParser) {
finished?()
}
/// Forwards a parse error to `fail`.
func parser(_: XMLParser, parseErrorOccurred: Error) {
fail?(parseErrorOccurred)
}
/// Forwards a validation error to `fail`.
func parser(_: XMLParser, validationErrorOccurred: Error) {
fail?(validationErrorOccurred)
}
/// Appends each chunk of character data to `content`.
func parser(_: XMLParser, foundCharacters: String) {
content += foundCharacters
}
}
Using some of the examples already given on this post
// Example 1: a complete HTML document, including a DOCTYPE declaration.
let string = "<!DOCTYPE html> <html> <body> <h1>My First Heading</h1> <p>My first paragraph.</p> </body> </html>"
let result = try await Parser(html: string).result
// My First Heading My first paragraph.
// Example 2: a fragment with several anchors carrying attributes and a trailing self-closing <br />.
let string = "LCD Soundsystem was the musical project of producer <a href='http://www.last.fm/music/James+Murphy' class='bbcode_artist'>James Murphy</a>, co-founder of <a href='http://www.last.fm/tag/dance-punk' class='bbcode_tag' rel='tag'>dance-punk</a> label <a href='http://www.last.fm/label/DFA' class='bbcode_label'>DFA</a> Records. Formed in 2001 in New York City, New York, United States, the music of LCD Soundsystem can also be described as a mix of <a href='http://www.last.fm/tag/alternative%20dance' class='bbcode_tag' rel='tag'>alternative dance</a> and <a href='http://www.last.fm/tag/post%20punk' class='bbcode_tag' rel='tag'>post punk</a>, along with elements of <a href='http://www.last.fm/tag/disco' class='bbcode_tag' rel='tag'>disco</a> and other styles. <br />"
let result = try await Parser(html: string).result
// LCD Soundsystem was the musical project of producer James Murphy, co-founder of dance-punk label DFA Records. Formed in 2001 in New York City, New York, United States, the music of LCD Soundsystem can also be described as a mix of alternative dance and post punk, along with elements of disco and other styles.
// Example 3: a single anchor with an empty href attribute.
let string = "my html <a href=\"\">link text</a>"
let result = try await Parser(html: string).result
// my html link text
Upvotes: 2
Reputation: 1225
Swift 5
extension String {
    /// Interprets the receiver as UTF-8 encoded HTML and returns its plain-text content.
    ///
    /// - Returns: The text of the resulting attributed string, or `nil` when the receiver
    ///   cannot be encoded as UTF-8 or the HTML conversion throws.
    public func trimHTMLTags() -> String? {
        guard let utf8Data = data(using: .utf8) else { return nil }

        let attributed = try? NSAttributedString(
            data: utf8Data,
            options: [
                .documentType: NSAttributedString.DocumentType.html,
                .characterEncoding: String.Encoding.utf8.rawValue
            ],
            documentAttributes: nil
        )
        return attributed?.string
    }
}
Use:
// Convert a small fragment and fall back to "--" when conversion returns nil.
let html = "my html <a href='https://www.google.com'>link text</a>"
let plainText = html.trimHTMLTags() ?? "--"
print(plainText) //"my html link text"
Upvotes: 3
Reputation: 1628
I prefer using a regular expression over the NSAttributedString HTML conversion. Be advised that the conversion is pretty time-consuming and needs to be run on the main thread too. More information here: https://developer.apple.com/documentation/foundation/nsattributedstring/1524613-initwithdata
For me this did the trick: first I remove any inline CSS styling, and then all the HTML tags. It is probably not as solid as the NSAttributedString option, but it is way faster for my case.
extension String {
    /// Returns the receiver with `<style>` elements (tags and their CSS) and all remaining
    /// HTML tags removed.
    ///
    /// This is a regular-expression approximation rather than an HTML parser; it does not
    /// decode character entities or handle `>` inside attribute values.
    func withoutHtmlTags() -> String {
        // Remove whole <style> elements first so their CSS does not survive as text.
        // `(?is)`   case-insensitive, and `.` also matches line breaks (multi-line CSS).
        // `\b[^>]*` allows attributes such as `type="text/css"`.
        // `.*?`     lazy, so each element is removed on its own and CSS containing `>`
        //           (child combinators) or an empty element is still matched.
        let withoutStyles = replacingOccurrences(
            of: "(?is)<style\\b[^>]*>.*?</style\\s*>",
            with: "",
            options: .regularExpression
        )
        // Then drop every remaining tag.
        return withoutStyles.replacingOccurrences(of: "<[^>]+>", with: "", options: .regularExpression)
    }
}
Upvotes: 6
Reputation: 19524
Hmm, I tried your function and it worked on a small example:
// Pre-Swift 3 API naming (`stringByReplacingOccurrencesOfString`, `.RegularExpressionSearch`).
// Every `<...>` tag is replaced with the empty string; text between tags is kept.
var string = "<!DOCTYPE html> <html> <body> <h1>My First Heading</h1> <p>My first paragraph.</p> </body> </html>"
let str = string.stringByReplacingOccurrencesOfString("<[^>]+>", withString: "", options: .RegularExpressionSearch, range: nil)
print(str)
//output " My First Heading My first paragraph. "
Can you give an example of a problem?
Swift 4 and 5 version:
// The sample input is never mutated, so it is declared with `let`.
let string = "<!DOCTYPE html> <html> <body> <h1>My First Heading</h1> <p>My first paragraph.</p> </body> </html>"
// Replace every `<...>` tag with the empty string; whitespace between tags is left in place.
let str = string.replacingOccurrences(of: "<[^>]+>", with: "", options: .regularExpression, range: nil)
Upvotes: 186
Reputation: 1202
extension String {
    /// The receiver with every `<...>` tag removed; text between tags is kept unchanged.
    var htmlStripped: String {
        let tagPattern = "<[^>]+>"
        return replacingOccurrences(of: tagPattern, with: "", options: .regularExpression)
    }
}
Happy Coding
Upvotes: 7
Reputation: 3347
Updated for Swift 4:
// Encode the HTML as UTF-16 (`.unicode`); the same encoding is declared to the importer below.
guard let htmlStringData = htmlString.data(using: .unicode) else { fatalError() }
// Dictionary entries must be comma-separated; without the comma after `.html`
// the literal does not compile.
let options: [NSAttributedString.DocumentReadingOptionKey: Any] = [
    .documentType: NSAttributedString.DocumentType.html,
    .characterEncoding: String.Encoding.unicode.rawValue
]
// `try!` traps if the HTML import throws.
let attributedHTMLString = try! NSAttributedString(data: htmlStringData, options: options, documentAttributes: nil)
// Plain-text content with the markup removed.
let string = attributedHTMLString.string
Upvotes: 2
Reputation: 915
Mohamed's solution, but as a String extension in Swift 4.
extension String {
    /// Interprets the receiver as HTML and returns its plain-text content.
    ///
    /// - Returns: The text of the imported attributed string, or `nil` if the import throws.
    /// - Note: Apple's documentation for the HTML importer says it should not be called
    ///   from a background thread.
    func stripOutHtml() -> String? {
        // Encode as UTF-8 so the bytes agree with the `.characterEncoding` declared below
        // (previously the data was UTF-16 while the option claimed UTF-8).
        let data = Data(utf8)
        let options: [NSAttributedString.DocumentReadingOptionKey: Any] = [
            .documentType: NSAttributedString.DocumentType.html,
            .characterEncoding: String.Encoding.utf8.rawValue
        ]
        do {
            let attributed = try NSAttributedString(data: data, options: options, documentAttributes: nil)
            return attributed.string
        } catch {
            // Callers rely on `nil` to signal that the conversion failed.
            return nil
        }
    }
}
Upvotes: 31
Reputation: 4708
Since HTML is not a regular language (HTML is a context-free language), you cannot use Regular Expressions. See: Using regular expressions to parse HTML: why not?
I would consider using NSAttributedString instead.
// Pre-Swift 3 API naming throughout (`dataUsingEncoding`, `NSDocumentTypeDocumentAttribute`, ...).
let htmlString = "LCD Soundsystem was the musical project of producer <a href='http://www.last.fm/music/James+Murphy' class='bbcode_artist'>James Murphy</a>, co-founder of <a href='http://www.last.fm/tag/dance-punk' class='bbcode_tag' rel='tag'>dance-punk</a> label <a href='http://www.last.fm/label/DFA' class='bbcode_label'>DFA</a> Records. Formed in 2001 in New York City, New York, United States, the music of LCD Soundsystem can also be described as a mix of <a href='http://www.last.fm/tag/alternative%20dance' class='bbcode_tag' rel='tag'>alternative dance</a> and <a href='http://www.last.fm/tag/post%20punk' class='bbcode_tag' rel='tag'>post punk</a>, along with elements of <a href='http://www.last.fm/tag/disco' class='bbcode_tag' rel='tag'>disco</a> and other styles. <br />"
// Force-unwrapped: traps if the string cannot be encoded as UTF-8.
let htmlStringData = htmlString.dataUsingEncoding(NSUTF8StringEncoding)!
// Declare the data as an HTML document encoded in UTF-8, matching the encoding used above.
let options: [String: AnyObject] = [NSDocumentTypeDocumentAttribute: NSHTMLTextDocumentType, NSCharacterEncodingDocumentAttribute: NSUTF8StringEncoding]
// `try!` traps if the import throws.
let attributedHTMLString = try! NSAttributedString(data: htmlStringData, options: options, documentAttributes: nil)
// Plain-text content of the imported document.
let string = attributedHTMLString.string
Or, as Irshad Mohamed in the comments would do it:
// Shorter variant: the data is encoded with `.unicode` and only the document type is passed
// as an option. The force-unwrap traps if encoding returns nil.
let attributed = try NSAttributedString(data: htmlString.data(using: .unicode)!, options: [NSDocumentTypeDocumentAttribute: NSHTMLTextDocumentType], documentAttributes: nil)
print(attributed.string)
Upvotes: 39
Reputation: 705
swift 4 :
extension String {
    /// Removes the opening and closing tags named `tag` (matched case-insensitively),
    /// keeping the text between them.
    ///
    /// - Parameter tag: Tag name. It is inserted into the regular expression as-is, so
    ///   pattern syntax in it is interpreted by the regex engine.
    /// - Returns: The receiver without the matching tags.
    func deleteHTMLTag(tag: String) -> String {
        // `\b` keeps "b" from matching "<br>". `[^>]*` ends the match at the tag's own `>`;
        // the previous `[^<]*` could run on to a later `>` and delete element text
        // (e.g. "<b>x > y</b>" lost "x >").
        let pattern = "(?i)</?\(tag)\\b[^>]*>"
        return replacingOccurrences(of: pattern, with: "", options: .regularExpression)
    }

    /// Removes the opening and closing tags for every name in `tags`, in order.
    ///
    /// - Parameter tags: Tag names to strip; an empty array returns the receiver unchanged.
    func deleteHTMLTags(tags: [String]) -> String {
        return tags.reduce(self) { partial, tag in
            partial.deleteHTMLTag(tag: tag)
        }
    }
}
Upvotes: 2
Reputation: 23986
I'm using the following extension to remove specific HTML elements:
// Pre-Swift 3 syntax (`stringByReplacingOccurrencesOfString`, unlabeled first argument).
extension String {
// Removes opening and closing tags named `tag`, case-insensitively (`(?i)`); `\b` requires
// a word boundary after the name.
// NOTE(review): `[^<]*>` can extend to a later `>` before the next `<`, which looks like it
// would also remove element text containing `>` — confirm.
func deleteHTMLTag(tag:String) -> String {
return self.stringByReplacingOccurrencesOfString("(?i)</?\(tag)\\b[^<]*>", withString: "", options: .RegularExpressionSearch, range: nil)
}
// Applies `deleteHTMLTag` once per entry of `tags`, feeding each result into the next call.
func deleteHTMLTags(tags:[String]) -> String {
var mutableString = self
for tag in tags {
mutableString = mutableString.deleteHTMLTag(tag)
}
return mutableString
}
}
This makes it possible to only remove <a>
tags from a string, e.g.:
// The inner quotes must be escaped; unescaped they end the string literal early.
let string = "my html <a href=\"\">link text</a>"
let withoutHTMLString = string.deleteHTMLTag("a") // Will be "my html link text"
Upvotes: 12