본문 바로가기
개발/java

자바 웹 크롤링 수정

by 카앙구운 2017. 5. 18.
728x90
반응형

/**
 * Returns the current date and time as text.
 *
 * @return the current moment formatted with the pattern {@code yyyy.MM.dd HH:mm:ss}
 */
public static String getCurrentData() {
    final Date now = new Date();
    return new SimpleDateFormat("yyyy.MM.dd HH:mm:ss").format(now);
}

/**
 * Starts the crawler: schedules a {@link ScheduledJob} to run once per minute
 * (first run after one second), lets it work for 400 seconds, then cancels the
 * timer and prints the end time.
 *
 * @param args command-line arguments (unused)
 * @throws ClientProtocolException declared for compatibility with existing callers
 * @throws IOException declared for compatibility with existing callers
 */
public static void main(String[] args) throws ClientProtocolException, IOException {
    // Scrape once per minute using a timer.
    ScheduledJob job = new ScheduledJob();
    Timer jobScheduler = new Timer();
    jobScheduler.scheduleAtFixedRate(job, 1000, 60000);

    try {
        Thread.sleep(400000);
    } catch (InterruptedException e) {
        // Do not swallow the interruption: restore the flag so that code further
        // up the stack can still observe it, and fall through to shut down early.
        Thread.currentThread().interrupt();
    } finally {
        // Always stop the timer thread, otherwise the JVM would never exit.
        jobScheduler.cancel();
    }

    System.out.println("End Date:"+getCurrentData());
}

}

class ScheduledJob  extends TimerTask {

public static String getCurrentData(){

SimpleDateFormat sdf = new SimpleDateFormat("yyyy.MM.dd HH:mm:ss");

return sdf.format(new Date());

}

public void run(){

//1. 가져오기전 시간찍기

System.out.println("Start Date : "+ getCurrentData());

//2.가져올 http 주소 셋팅

//http://www.todayhumor.co.kr/board/view.php?table=bestofbest&no="+no+"&s_no="+s_no+"&page=1

HttpPost httpBob = new HttpPost("http://www.todayhumor.co.kr/board/list.php?table=bestofbest");

HttpClient httpClientBob = HttpClientBuilder.create().build();

HttpResponse responseBob = null;

try {

responseBob = httpClientBob.execute(httpBob);

} catch (ClientProtocolException e1) {

// TODO Auto-generated catch block

e1.printStackTrace();

} catch (IOException e1) {

// TODO Auto-generated catch block

e1.printStackTrace();

}

HttpEntity entityBob = responseBob.getEntity();

ContentType contentTypeBob= ContentType.getOrDefault(entityBob);

BufferedReader brBob = null;

try {

brBob = new BufferedReader(new InputStreamReader(entityBob.getContent()));

} catch (UnsupportedOperationException | IOException e1) {

// TODO Auto-generated catch block

e1.printStackTrace();

}

StringBuffer sbBob = new StringBuffer();

String lineBob="";

try {

while((lineBob=brBob.readLine())!=null){

sbBob.append(lineBob + "\n");

}

} catch (IOException e1) {

// TODO Auto-generated catch block

e1.printStackTrace();

}

String sBob=sbBob.toString();

String s2Bob = sBob.substring(sBob.indexOf("bestofbest&no")+14,sBob.indexOf("bestofbest&no")+20);

int no = Integer.parseInt(s2Bob);

int s_no = Integer.parseInt(s2Bob);

for (int i = 20; i >0 ; i--) {

no=no-1;

s_no=s_no-1;

System.out.println("no:"+no);

HttpPost http = new HttpPost("http://www.todayhumor.co.kr/board/view.php?table=bestofbest&no="+no+"&s_no="+s_no+"&page=1");

//3.가져오기를 실행할 클라이언트 객체 생성

HttpClient httpClient = HttpClientBuilder.create().build();

//4.실행 및 실행 데이터를 Response 객체에 담음

HttpResponse response = null;

try {

response = httpClient.execute(http);

} catch (IOException e1) {

// TODO Auto-generated catch block

e1.printStackTrace();

}

//5.Response 받은 데이터 중 , DOM 데이터를 가져와 Entity에 담음

HttpEntity entity = response.getEntity();

//6.Charset을 알아내기 위해 DOM의 컨텐트 타입을 가져와 담고 Charset을 가져옴

ContentType contentType= ContentType.getOrDefault(entity);

Charset charset= contentType.getCharset();

//7.DOM 데이터를 한 줄씩 읽기위해 Reader에 담음(InputStream/Buffered 중 선택은 개인취향)

BufferedReader br = null;

try {

br = new BufferedReader(new InputStreamReader(entity.getContent()));

} catch (UnsupportedOperationException | IOException e1) {

// TODO Auto-generated catch block

e1.printStackTrace();

}

//8.가져온 DOM데이터를 담기위한 그릇

StringBuffer sb = new StringBuffer();

//9.DOM데이터 가지고 오기

String line="";

try {

while((line=br.readLine())!=null){

sb.append(line + "\n");

}

} catch (IOException e1) {

// TODO Auto-generated catch block

e1.printStackTrace();

}

//10.가져온 아름다운 DOM을 보자

//System.out.println(sb.toString());

try {

BufferedWriter out = new BufferedWriter(new FileWriter("out.txt"));

String s= sb.toString();

out.write(s);

out.newLine();

out.close();

} catch (IOException e) {

System.err.println(e);

System.exit(1);

}

String htmlParing=sb.toString();

String parsingEdit="";

int k=0;

ArrayList<Integer> location = new ArrayList<Integer>();

ArrayList<String> imgeName = new ArrayList<String>();

try {

boolean inst =true;

int locationInteger=htmlParing.indexOf("http://thimg.todayhumor.co.kr/upfile");

if(locationInteger>-1){

location.add(locationInteger);

k++;

}else{

inst = false;

}

while(inst){

locationInteger=htmlParing.indexOf("http://thimg.todayhumor.co.kr/upfile",locationInteger+1);

if(locationInteger>-1){

location.add(locationInteger);

k++;

}else{

inst = false;

}

}

for (int j = 0; j < location.size(); j++) {

parsingEdit=htmlParing.substring(location.get(j),location.get(j)+140);


if(parsingEdit.indexOf(".png")>-1){

imgeName.add(parsingEdit.substring(0,parsingEdit.indexOf(".png"))+".png");

}else if(parsingEdit.indexOf(".jpg")>-1){

imgeName.add(parsingEdit.substring(0,parsingEdit.indexOf(".jpg"))+".jpg");

}

}

System.out.println("이미지 개수:"+k);

} catch (Exception e) {

System.out.println("이미지가 없음");

}

try {

//String imgUrl="http://thimg.todayhumor.co.kr/upfile/201705/1494399838fa07198aa28147d48d8b5b7e62fea3c3__mn626423__w1080__h1440__f91058__Ym201705.png";

for (int j = 0; j < imgeName.size(); j++) {

System.out.println("list1:for문:"+imgeName.get(j));

String imgUrl=imgeName.get(j);

URL url = new URL(imgUrl);

String fileName= imgUrl.substring(imgUrl.lastIndexOf('/')+1,imgUrl.length());

String ext=imgUrl.substring(imgUrl.lastIndexOf('.')+1,imgUrl.length());

BufferedImage img= ImageIO.read(url);

File directoryFile= new File("C://FFF");

if(!directoryFile.exists()){

directoryFile.mkdir();

}

ImageIO.write(img, ext, new File("C://FFF//"+fileName));

}

} catch (Exception e) {

System.out.println("이미지가 없습니다.");

}

}

}

728x90
반응형

댓글