Java爬取网站信息

 被注释掉的代码用于爬取名字,未注释的代码用于爬取姓氏;点击下面的链接可直接跳转到IO详解。

点这里跳转IO详解(文章最下面也有同一链接):https://blog.csdn.net/m0_71149935/article/details/134619732?spm=1001.2014.3001.5502

import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Minimal page scraper: downloads the text behind a URL and extracts every
 * regular-expression match from it.
 */
public class UIR {
    /** Charset used when the response does not declare a usable one. */
    private static final Charset DEFAULT_CHARSET = StandardCharsets.UTF_8;

    /** Prefix of the charset parameter inside a Content-Type header value. */
    private static final String CHARSET_PARAM = "charset=";

    /**
     * Downloads a fixed page and prints every two-character string that
     * directly follows a '、' separator (per the accompanying article this is
     * the surname list).
     *
     * @param args unused
     * @throws IOException if the page cannot be downloaded
     */
    public static void main(String[] args) throws IOException {
        // Alternative example (given names), kept for reference by the article:
//        String str1="https://hanyu.baidu.com/shici/detail?from=aladdin&pid=0b2f26d4c0ddb3ee693fdb1137ee1b0d&showPinyin=1";
//        String regex1="(.{4})(,|。)";
//        ArrayList<String> data = getData(str1, regex1,1);
//        System.out.println(data);

        String str2 = "https://www.shopwang.net/68576739.html";
        String regex2 = "、(.{2})";
        ArrayList<String> data1 = getData(str2, regex2, 1);
        System.out.println(data1);
    }

    /**
     * Downloads the resource at {@code str} and collects one capture group
     * from every match of {@code regex} in its text, in order of appearance.
     *
     * @param str   address of the resource to download
     * @param regex regular expression applied to the downloaded text
     * @param index capture group to collect from each match (0 = whole match)
     * @return the collected group values; empty when nothing matches
     * @throws IOException if the resource cannot be read
     * @throws java.util.regex.PatternSyntaxException if {@code regex} is invalid
     * @throws IndexOutOfBoundsException if {@code regex} has no group {@code index}
     */
    public static ArrayList<String> getData(String str, String regex, int index) throws IOException {
        // Compile first so an invalid pattern fails before any network traffic.
        Pattern pattern = Pattern.compile(regex);
        Matcher matcher = pattern.matcher(Url(str));

        ArrayList<String> list = new ArrayList<>();
        while (matcher.find()) {
            list.add(matcher.group(index));
        }
        return list;
    }

    /**
     * Downloads the resource at {@code s} and returns its content as text.
     *
     * <p>The bytes are decoded with the charset announced in the response's
     * Content-Type header, or UTF-8 when none is announced, instead of the
     * platform default charset, so the result does not depend on the machine
     * the program runs on.
     *
     * @param s address of the resource to download
     * @return the complete decoded content
     * @throws IOException if the address is malformed or the resource cannot be read
     */
    public static String Url(String s) throws IOException {
        URL url = new URL(s);
        URLConnection urlConnection = url.openConnection();
        Charset charset = charsetOf(urlConnection);

        StringBuilder sb = new StringBuilder();
        // try-with-resources closes the stream even when a read fails midway.
        try (InputStreamReader isr = new InputStreamReader(urlConnection.getInputStream(), charset)) {
            char[] buffer = new char[8192];
            int read;
            while ((read = isr.read(buffer)) != -1) {
                sb.append(buffer, 0, read);
            }
        }
        return sb.toString();
    }

    /**
     * Picks the charset for decoding a response: the {@code charset} parameter
     * of its Content-Type header when present and supported, UTF-8 otherwise.
     */
    private static Charset charsetOf(URLConnection connection) {
        String contentType = connection.getContentType();
        if (contentType == null) {
            return DEFAULT_CHARSET;
        }
        for (String part : contentType.split(";")) {
            String param = part.trim();
            if (param.regionMatches(true, 0, CHARSET_PARAM, 0, CHARSET_PARAM.length())) {
                String name = param.substring(CHARSET_PARAM.length()).replace("\"", "").trim();
                try {
                    return Charset.forName(name);
                } catch (IllegalArgumentException ignored) {
                    // Malformed or unsupported charset name: use the default instead.
                    return DEFAULT_CHARSET;
                }
            }
        }
        return DEFAULT_CHARSET;
    }
}

点这里直接跳转IO详解:https://blog.csdn.net/m0_71149935/article/details/134619732?spm=1001.2014.3001.5502