Thursday, July 4, 2024

HTML Entity Parser

Given a string str which has various HTML Entities in it, the task is to replace these entities with their corresponding special character.

An HTML entity parser takes HTML code as input and replaces every entity with the special character it represents. For example, for the quotation mark the entity is &quot; and the character is ".

The HTML entities and their corresponding special characters are shown in the table below:

Name/ Description HTML Entity Special Character
Space &nbsp; (space)
Ampersand &amp; &
Greater than &gt; >
Less than &lt; <
Single Quotation Mark &apos; '
Double Quotation Mark &quot; "
Registered Trademark &reg; ®
Copyright mark &copy; ©
Forward Slash &frasl; /

Examples:

Input: str = “17 &gt; 25 and 25 &lt; 17” 
Output: 17 > 25 and 25 < 17 
Explanation: In the above example &gt; is replaced by corresponding special character > and &lt; is replaced by < 

Input: str = “&copy; is symbol of copyright” 
Output: © is symbol of copyright 
Explanation: In the above example &copy; is replaced by corresponding special character ©

Method 1 – using unordered_map: Below are the steps:

  1. Store the HTML Entity with their character in a Map.
  2. Traverse the given string and if any character ‘&’ is encountered then find which HTML Entity is present after this ampersand.
  3. Add the corresponding character with the Entity in the output string.
  4. Print the output string as the result.

Below is the implementation of the above approach: 

C++




// C++ program for the above approach
#include <iostream>
#include <unordered_map>
using namespace std;
 
// Decodes a fixed set of HTML entities (for example "&gt;") into the
// characters they stand for, using a lookup table keyed by the complete
// entity text.
class GfG {
public:
    // Entity text, including the leading '&' and trailing ';', mapped to
    // its replacement string.
    std::unordered_map<std::string, std::string> m;

public:
    // Fills the lookup table. Call this before parseInputString(); with an
    // empty table the input comes back unchanged.
    void initializeMap()
    {
        m["&quot;"] = "\"";
        m["&apos;"] = "'";
        m["&amp;"] = "&";
        m["&gt;"] = ">";
        m["&lt;"] = "<";
        m["&frasl;"] = "/";
        m["&nbsp;"] = " ";
        m["&reg;"] = "®";
        m["&copy;"] = "©";
    }

public:
    // Returns a copy of `input` in which every entity present in `m` is
    // replaced by its mapped string. Text that is not a known entity,
    // including a lone '&' or an unknown "&name;", is copied verbatim.
    // The input is scanned once, so replacement text is never re-decoded
    // ("&amp;lt;" yields "&lt;").
    std::string parseInputString(std::string input)
    {
        const std::string::size_type n = input.size();
        std::string output;
        output.reserve(n);

        std::string::size_type i = 0;
        while (i < n) {
            if (input[i] == '&') {
                // A candidate runs from this '&' to the first ';'. Stop at
                // another '&' as well: a stray ampersand must not hide a
                // real entity that follows it.
                std::string::size_type end = i + 1;
                while (end < n && input[end] != ';' && input[end] != '&') {
                    ++end;
                }
                if (end < n && input[end] == ';') {
                    auto it = m.find(input.substr(i, end - i + 1));
                    if (it != m.end()) {
                        output += it->second;
                        i = end + 1;
                        continue;
                    }
                }
            }

            // Ordinary character, or an '&' that starts no known entity.
            output += input[i];
            ++i;
        }

        return output;
    }
};
 
// Driver Code
int main()
{
    // Given String
    string input = "17 > 25 and 25 < 17";
    GfG g;
 
    // Initialised parsed string
    g.initializeMap();
 
    // Function Call
    cout << g.parseInputString(input);
    return 0;
}


Java




import java.util.HashMap;
import java.util.Map;
// Java program for the above approach
/**
 * Decodes a fixed set of HTML entities (for example {@code &gt;}) into the
 * characters they represent.
 */
public class HtmlEntityParser {
  /**
   * Entity text, including the leading {@code '&'} and trailing {@code ';'},
   * mapped to its replacement string.
   */
  public static Map<String, String> map = new HashMap<>();

  static {
    // Associating html entity with special character.
    map.put("&quot;", "\"");
    map.put("&apos;", "'");
    map.put("&amp;", "&");
    map.put("&gt;", ">");
    map.put("&lt;", "<");
    map.put("&frasl;", "/");
    map.put("&nbsp;", " ");
    // Unicode escapes keep the source independent of the compiler's charset.
    map.put("&reg;", "\u00AE");
    map.put("&copy;", "\u00A9");
  }

  /**
   * Replaces every entity found in {@link #map} with its mapped string.
   *
   * <p>Text that is not a known entity, including a lone {@code '&'} or an
   * unknown {@code &name;}, is copied verbatim. The input is scanned once, so
   * replacement text is never decoded again ({@code &amp;lt;} yields
   * {@code &lt;}).
   *
   * @param input the text to decode; must not be {@code null}
   * @return the decoded text
   */
  public static String parseInputString(String input) {
    final int length = input.length();
    StringBuilder output = new StringBuilder(length);

    int i = 0;
    while (i < length) {
      char current = input.charAt(i);
      if (current == '&') {
        // A candidate runs from this '&' to the first ';'. Stop at another
        // '&' as well: a stray ampersand must not hide a real entity that
        // follows it.
        int end = i + 1;
        while (end < length && input.charAt(end) != ';' && input.charAt(end) != '&') {
          end++;
        }
        if (end < length && input.charAt(end) == ';') {
          String replacement = map.get(input.substring(i, end + 1));
          if (replacement != null) {
            output.append(replacement);
            i = end + 1;
            continue;
          }
        }
      }
      // Ordinary character, or an '&' that starts no known entity.
      output.append(current);
      i++;
    }
    return output.toString();
  }

  public static void main(String[] args) {
    // Entity-encoded sample input.
    String input = "17 &gt; 25 and 25 &lt; 17";
    // Prints: 17 > 25 and 25 < 17
    System.out.println(parseInputString(input));
  }
}


Python3




# Python program for the above approach
class GfG:
    """Decode a fixed set of HTML entities (e.g. ``&gt;``) into characters."""

    def __init__(self):
        # Entity text (leading '&' and trailing ';' included) -> replacement.
        self.m = {}

    # Associating html entity with special character
    def initializeMap(self):
        """Fill the entity table; call this before parseInputString()."""
        self.m["&quot;"] = "\""
        self.m["&apos;"] = "'"
        self.m["&amp;"] = "&"
        self.m["&gt;"] = ">"
        self.m["&lt;"] = "<"
        self.m["&frasl;"] = "/"
        self.m["&nbsp;"] = " "
        self.m["&reg;"] = "®"
        self.m["&copy;"] = "©"

    def parseInputString(self, input):
        """Return ``input`` with every entity found in ``self.m`` replaced.

        Text that is not a known entity, including a lone ``&`` or an
        unknown ``&name;``, is copied verbatim. The input is scanned once,
        so replacement text is never decoded again (``&amp;lt;`` yields
        ``&lt;``).
        """
        pieces = []
        n = len(input)
        i = 0
        while i < n:
            if input[i] == '&':
                # A candidate runs from this '&' to the first ';'. Stop at
                # another '&' as well: a stray ampersand must not hide a
                # real entity that follows it.
                end = i + 1
                while end < n and input[end] not in ';&':
                    end += 1
                if end < n and input[end] == ';':
                    replacement = self.m.get(input[i:end + 1])
                    if replacement is not None:
                        pieces.append(replacement)
                        i = end + 1
                        continue

            # Ordinary character, or an '&' that starts no known entity.
            pieces.append(input[i])
            i += 1

        # Join once at the end instead of repeated string concatenation.
        return "".join(pieces)
 
# Driver Code
if __name__ == '__main__':
    # Entity-encoded sample: it must contain "&gt;" and "&lt;" so the
    # parser has something to decode.
    input_str = "17 &gt; 25 and 25 &lt; 17"
    g = GfG()

    # The entity table has to be filled before parsing.
    g.initializeMap()

    # Prints: 17 > 25 and 25 < 17
    print(g.parseInputString(input_str))
     
# Contributed by adityasha4x71


Javascript




// JavaScript program for the above approach
// Decodes a fixed set of HTML entities (for example "&gt;") into the
// characters they stand for.
class GfG {
  constructor() {
    // Entity text (leading '&' and trailing ';' included) -> replacement.
    this.m = {};
  }

  // Fills the entity table; call this before parseInputString().
  initializeMap() {
    this.m["&quot;"] = "\"";
    this.m["&apos;"] = "'";
    this.m["&amp;"] = "&";
    this.m["&gt;"] = ">";
    this.m["&lt;"] = "<";
    this.m["&frasl;"] = "/";
    this.m["&nbsp;"] = " ";
    this.m["&reg;"] = "®";
    this.m["&copy;"] = "©";
  }

  // Returns `input` with every entity found in `this.m` replaced by its
  // mapped string. Text that is not a known entity, including a lone '&'
  // or an unknown "&name;", is copied verbatim. The input is scanned once,
  // so replacement text is never decoded again ("&amp;lt;" gives "&lt;").
  parseInputString(input) {
    const pieces = [];
    const n = input.length;
    let i = 0;

    while (i < n) {
      if (input[i] === '&') {
        // A candidate runs from this '&' to the first ';'. Stop at another
        // '&' as well: a stray ampersand must not hide a real entity that
        // follows it.
        let end = i + 1;
        while (end < n && input[end] !== ';' && input[end] !== '&') {
          end++;
        }
        if (end < n && input[end] === ';') {
          const candidate = input.slice(i, end + 1);
          // Own-property check, so an empty-string replacement would still
          // count as a match and inherited keys never do.
          if (Object.prototype.hasOwnProperty.call(this.m, candidate)) {
            pieces.push(this.m[candidate]);
            i = end + 1;
            continue;
          }
        }
      }

      // Ordinary character, or an '&' that starts no known entity.
      pieces.push(input[i]);
      i++;
    }

    return pieces.join("");
  }
}
 
// Driver Code
  // Given String
  const input_str = "17 > 25 and 25 < 17";
  const g = new GfG();
 
  // Initialised parsed string
  g.initializeMap();
 
  // Function Call
  console.log(g.parseInputString(input_str));


C#




// C# program for the above approach
 
using System;
using System.Collections.Generic;
 
/// <summary>
/// Decodes a fixed set of HTML entities (for example "&amp;gt;") into the
/// characters they represent.
/// </summary>
public class HtmlEntityParser {
    // Entity text (leading '&' and trailing ';' included) -> replacement.
    public static Dictionary<string, string> map = new Dictionary<string, string>()
    {
        // Associating html entity with
        // special character
        { "&quot;", "\"" },
        { "&apos;", "'" },
        { "&amp;", "&" },
        { "&gt;", ">" },
        { "&lt;", "<" },
        { "&frasl;", "/" },
        { "&nbsp;", " " },
        { "&reg;", "®" },
        { "&copy;", "©" }
    };

    /// <summary>
    /// Replaces every entity found in <see cref="map"/> with its mapped
    /// string. Text that is not a known entity, including a lone ampersand
    /// or an unknown named entity, is copied verbatim. The input is scanned
    /// once, so replacement text is never decoded again.
    /// </summary>
    /// <param name="input">Text to decode; must not be null.</param>
    /// <returns>The decoded text.</returns>
    public static string ParseInputString(string input)
    {
        var output = new System.Text.StringBuilder(input.Length);

        int i = 0;
        while (i < input.Length)
        {
            if (input[i] == '&')
            {
                // A candidate runs from this '&' to the first ';'. Stop at
                // another '&' as well: a stray ampersand must not hide a
                // real entity that follows it.
                int end = i + 1;
                while (end < input.Length && input[end] != ';' && input[end] != '&')
                {
                    end++;
                }

                string replacement;
                if (end < input.Length && input[end] == ';'
                    && map.TryGetValue(input.Substring(i, end - i + 1), out replacement))
                {
                    output.Append(replacement);
                    i = end + 1;
                    continue;
                }
            }

            // Ordinary character, or an '&' that starts no known entity.
            output.Append(input[i]);
            i++;
        }

        return output.ToString();
    }

    public static void Main(string[] args)
    {
        // Entity-encoded sample input.
        string input = "17 &gt; 25 and 25 &lt; 17";
        // Prints: 17 > 25 and 25 < 17
        Console.WriteLine(ParseInputString(input));
    }
}
// Contributed by adityasharmadev01


Output:

17 > 25 and 25 < 17

Time Complexity: O(N) 
Auxiliary Space: O(N) 

Method 2 – using Pattern Matching: Below are the steps:

  1. Traverse the given string str.
  2. While traversing, if any character ‘&’ is encountered then find which HTML Entity is present after this ampersand.
  3. Add the corresponding character with the Entity in the output string from the above table of matched character in the above table.
  4. Print the output string as the result after traversing the above string.

Below is the implementation of the above approach: 

C++




// C++ program to Parse the HTML Entities
#include <iostream>
using namespace std;
 
// Decodes a fixed set of HTML entities by comparing each candidate against
// the known entity spellings directly (no lookup table).
class GfG {

    // Returns the replacement for `entity` (the full text from '&' through
    // ';'), or nullptr when it is not one of the supported entities.
    static const char* matchEntity(const std::string& entity)
    {
        if (entity == "&quot;") return "\"";
        if (entity == "&apos;") return "'";
        if (entity == "&amp;") return "&";
        if (entity == "&gt;") return ">";
        if (entity == "&lt;") return "<";
        if (entity == "&frasl;") return "/";
        if (entity == "&nbsp;") return " ";
        if (entity == "&reg;") return "®";
        if (entity == "&copy;") return "©";
        return nullptr;
    }

public:
    // Returns a copy of `input` with every supported entity replaced by its
    // character. Text that is not a supported entity, including a lone '&'
    // or an unknown "&name;", is copied verbatim. The input is scanned
    // once, so replacement text is never decoded again.
    std::string parseInputString(std::string input)
    {
        const std::string::size_type n = input.size();

        // To store parsed string
        std::string output;
        output.reserve(n);

        std::string::size_type i = 0;
        while (i < n) {
            if (input[i] == '&') {
                // A candidate runs from this '&' to the first ';'. Stop at
                // another '&' as well: a stray ampersand must not hide a
                // real entity that follows it.
                std::string::size_type end = i + 1;
                while (end < n && input[end] != ';' && input[end] != '&') {
                    ++end;
                }
                if (end < n && input[end] == ';') {
                    const char* replacement
                        = matchEntity(input.substr(i, end - i + 1));
                    if (replacement != nullptr) {
                        output += replacement;
                        i = end + 1;
                        continue;
                    }
                }
            }

            // Ordinary character, or an '&' that starts no known entity.
            output += input[i];
            ++i;
        }

        // Return the parsed string
        return output;
    }
};
 
// Driver Code
int main()
{
    // Given String
    string input = "17 > 25 and 25 < 17";
    GfG g;
 
    // Initialised parsed string
    g.initializeMap();
 
    // Function Call
    cout << g.parseInputString(input);
    return 0;
}


Output:

17 > 25 and 25 < 17

Time Complexity: O(N) 
Auxiliary Space: O(N) 

Method 3 – using Regular Expression: Below are the steps:

  1. Store all the expressions with their mapped values in a Map M.
  2. For each key in the map, create a regular expression using:

regex e(key);

  1. Now replace the above regular expression formed with it’s mapped value in the Map M as:

regex_replace(str, e, value); where, str is the input string, e is the expression formed in the above step, and val is the value mapped with expression e in the Map

  1. Repeat the above steps until every expression in the Map has been replaced.

Below is the implementation of the above approach: 

C++




// C++ program for the above approach
#include <iostream>
#include <regex>
#include <unordered_map>
using namespace std;
 
// Given Expression with mapped value
// Entity text (used verbatim as a regular expression) mapped to its
// replacement. Initialised in the definition: a const object cannot be
// assigned afterwards, and statements are not allowed at namespace scope.
const std::unordered_map<std::string, std::string> m = {
    { "&quot;", "\"" },
    { "&apos;", "'" },
    { "&amp;", "&" },
    { "&gt;", ">" },
    { "&lt;", "<" },
    { "&frasl;", "/" }
};

// Returns `input` with every entity listed in `m` replaced by its mapped
// string, using one regex_replace pass per entity.
//
// "&amp;" is deliberately handled last. unordered_map iteration order is
// unspecified, and decoding "&amp;" first would turn "&amp;lt;" into
// "&lt;", which a later pass would then wrongly decode again to "<".
std::string
parseInputString(std::string input)
{
    const std::string ampersandEntity = "&amp;";

    for (const auto& it : m) {
        if (it.first == ampersandEntity) {
            continue;
        }

        // The entity names contain no regex metacharacters, so the key can
        // be used as the pattern directly.
        std::regex e(it.first);
        input = std::regex_replace(input, e, it.second);
    }

    auto amp = m.find(ampersandEntity);
    if (amp != m.end()) {
        std::regex e(amp->first);
        input = std::regex_replace(input, e, amp->second);
    }

    // Return the parsed string
    return input;
}
 
// Driver Code
int main()
{
    // Given String
    string input
        = "17 > 25 and 25 < 17";
 
    // Function Call
    cout << parseInputString(input);
    return 0;
}


Output:

17 > 25 and 25 < 17

Time Complexity: O(N) 
Auxiliary Space: O(N)

Feeling lost in the world of random DSA topics, wasting time without progress? It’s time for a change! Join our DSA course, where we’ll guide you on an exciting journey to master DSA efficiently and on schedule.
Ready to dive in? Explore our Free Demo Content and join our DSA course, trusted by over 100,000 neveropen!

Nokonwaba Nkukhwana
Experience as a skilled Java developer and proven expertise in using tools and technical developments to drive improvements throughout a entire software development life cycle. I have extensive industry and full life cycle experience in a java based environment, along with exceptional analytical, design and problem solving capabilities combined with excellent communication skills and ability to work alongside teams to define and refine new functionality. Currently working in springboot projects(microservices). Considering the fact that change is good, I am always keen to new challenges and growth to sharpen my skills.
RELATED ARTICLES

LEAVE A REPLY

Please enter your comment!
Please enter your name here

Most Popular

Recent Comments