Parent directory

HTML parser using regex

The basic intent is to let someone download the HTML code of a website and extract parts of it using a regular expression (for example, one that matches a particular tag) that they provide.

Download

Usage

	HtmlTagParser parser = new HtmlTagParser();
        // Store capture group 1 of every <b>...</b> match found on the page.
        parser.parseWebsite("https://akiod.net", "<b>(.*?)</b>", 1);
        // Optional: each filter() call keeps only the matches containing the given text.
        parser.filter("first keyword");
        parser.filter("second keyword");
        
        ArrayList tmp = parser.getList();
        for(int i = 0; i < tmp.size(); i++)
        {
            System.out.println(tmp.get(i));
        }
The code above uses the parseWebsite() method to download the website's HTML code, match it against the given regex, and put the given group of every match into the result arraylist. The optional filter() calls then keep only the matches that contain the given text. The getList() method returns the arraylist holding the remaining matches. Finally, the for loop prints each entry of the arraylist, which can then be modified and filtered further according to your needs.

Main Code

import java.util.regex.Pattern;
import java.util.regex.Matcher;

import java.net.URL;
import java.net.URLConnection;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;

import java.util.ArrayList;

/**
 * Collects regex matches from a string or from the html code of a downloaded website.
 *
 * Every call to parse() or parseWebsite() appends the requested group of each match to an
 * internal results arraylist; filter() narrows that arraylist and getList() returns it.
 * Websites are read and matched one line at a time, so a pattern cannot match across line breaks.
 */
public class HtmlTagParser
{
    private ArrayList<String> result;

    /**
     * Creates a parser with an empty results arraylist.
     */
    public HtmlTagParser()
    {
        result = new ArrayList<String>();
    }
    /**
     * Finds every match of the regex [pattern] in [input] and appends the given [group] of each
     * match to the results arraylist.
     *
     * @param input   text to search
     * @param pattern regular expression in java.util.regex syntax
     * @param group   index of the capture group to store (0 is the whole match)
     * @throws java.util.regex.PatternSyntaxException if [pattern] is not a valid regex
     * @throws IndexOutOfBoundsException if a match is found and [pattern] has no group [group]
     */
    public void parse(String input, String pattern, int group)
    {
        collectMatches(Pattern.compile(pattern), input, group);
    }
    /**
     * Downloads website html content and parses it by the regex [pattern] and inputs the given [group] in the
     * results arraylist.
     *
     * The content is matched line by line. A malformed url or an I/O failure is reported on
     * standard output and leaves any matches collected so far in the results arraylist.
     *
     * @param url     address of the website to download
     * @param pattern regular expression in java.util.regex syntax
     * @param group   index of the capture group to store (0 is the whole match)
     * @throws java.util.regex.PatternSyntaxException if [pattern] is not a valid regex
     * @throws IndexOutOfBoundsException if a match is found and [pattern] has no group [group]
     */
    public void parseWebsite(String url, String pattern, int group)
    {
        // Compile once instead of once per downloaded line; a bad pattern fails before any download.
        Pattern compiled = Pattern.compile(pattern);

        try
        {
            URL site = new URL(url);
            URLConnection connection = site.openConnection();
            BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream()));

            try
            {
                String line;
                while((line = reader.readLine()) != null)
                {
                    collectMatches(compiled, line, group);
                }
            }
            finally
            {
                // Close the reader even when reading or matching fails part-way through.
                reader.close();
            }
        }
        catch(MalformedURLException e)
        {
            System.out.println("MalformedURLException error.");
        }
        catch(IOException e)
        {
            System.out.println("IO error.");
        }
    }
    /**
     * Removes values from result arraylist which don't contain the [token] parameter.
     *
     * Null entries (stored when the requested group did not take part in a match) are removed
     * as well. The arraylist previously returned by getList() is emptied and a new arraylist
     * holding the remaining values takes its place.
     *
     * @param token text every remaining value must contain; must not be null
     */
    public void filter(String token)
    {
        ArrayList<String> kept = new ArrayList<String>();

        for(String value : result)
        {
            // A null entry cannot contain the token, so it is dropped rather than dereferenced.
            if(value != null && value.contains(token))
            {
                kept.add(value);
            }
        }

        result.clear();
        result = kept;
    }
    /**
     * Returns the results arraylist itself (not a copy), holding the matches collected so far.
     */
    public ArrayList<String> getList()
    {
        return result;
    }

    /**
     * Appends the given [group] of every match of [compiled] in [input] to the results arraylist.
     */
    private void collectMatches(Pattern compiled, String input, int group)
    {
        Matcher matcher = compiled.matcher(input);

        while(matcher.find())
        {
            result.add(matcher.group(group));
        }
    }
}

Notes