Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Lenta grabber #9

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions flamenews/lenta-grabber/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<parent>
<groupId>com.spbsu.flamestream.flamenews</groupId>
<artifactId>flamenews</artifactId>
<version>1.0-SNAPSHOT</version>
<relativePath>../</relativePath>
</parent>

<artifactId>flamenews-lenta-grabber</artifactId>
<version>1.0-SNAPSHOT</version>

<dependencies>
<dependency>
<groupId>javax.xml.bind</groupId>
<artifactId>jaxb-api</artifactId>
<version>2.3.1</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.2.3</version>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Версии мы задаем в parent.pom

</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpcore</artifactId>
<version>4.2.4</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
</dependencies>
</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,326 @@
package com.spbsu.flamestream.flamenews.lenta;

import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBException;
import javax.xml.bind.Unmarshaller;
import javax.xml.bind.annotation.*;
import javax.xml.transform.stream.StreamSource;
import java.io.*;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

public class LentaGrabber {

private static void addNewFile (String dirPath, News news) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Это функционал, который не относится напрямую к LentaGrabber. Давайте сделаем для него интерфейс и вынесем отдельно.

int curNameOfFile = 0;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Между объявлением переменной и ее использованием как-то слишком много левого кода :)

File theDir = new File(dirPath);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Мы все неизменияемые поля и переменные объявляем final, давайте соблюдать такой code-style

if (!theDir.exists()) {
System.out.println("Creating directory: " + dirPath);
try{
theDir.mkdir();
System.out.println("Directory created");
}
catch(SecurityException e){
System.out.println(e.getMessage());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Давайте логировать не sout-ами, а Logger-ом

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • в этом случае дальнейшее исполнение вряд ли имеет смысл, поэтому лучше бросить RuntimeException
catch(SecurityException e){
   throw new RuntimeException(e);
}

}
}

File[] files = theDir.listFiles();
for (File file : files) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Это точно нужно делать на каждый вызов метода?

String name = file.getName();
int num;
try {
num = Integer.parseInt(name.substring(0, name.lastIndexOf(".")));
} catch (NumberFormatException e) {
continue;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Если мы здесь обломались, наверное что-то пошло не так и продолжать не стоит

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Я имел в виду, что если что-то (зачем-то) положил в директорию ../news/ что-то странное с дурацким именем, то он, конечно, человек нехороший, но ну и пусть. Но если не пусть, то подправлю.

}
if (num > curNameOfFile) {
curNameOfFile = num;
}
}

curNameOfFile++;
String filePath = dirPath + String.valueOf(curNameOfFile) + ".xml";
try {
FileWriter writer = new FileWriter(filePath, false);
writer.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
writer.append('\n');
writer.write("<item>");
writer.append('\n');
writer.write("<title>" + news.getTitle() + "</title>");
writer.append('\n');
writer.write("<text>" + news.getText() + "</text>");
writer.append('\n');
writer.write("<category>" + news.getCategory() + "</category>");
writer.append('\n');
writer.write("</item>");
writer.flush();
}
catch (IOException e) {
System.out.println(e.getMessage());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

То же, что и выше

}
}

private static final String directory = "../news/";
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Я бы это парметром передавал в main

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

теперь так


private static final String URL_LENTA = "https://lenta.ru/rss/news";
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Это, возможно, тоже

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

теперь так


public static void main(String[] args) throws IOException, JAXBException, InterruptedException {
Item oldItem = null;
for(;;) {
List<News> news = new ArrayList<>();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Зачем нужен этот список?

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

может быть несколько новостей за последнюю минуту

StreamSource xml = new StreamSource(new StringReader(sendRequest(URL_LENTA)));
JAXBContext jaxbContext = JAXBContext.newInstance(RSS.class);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Тут точно нельзя вынести создание некоторых объектов из цикла? Плодить объект на каждую итерацию без неообходимости - не круто

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

исправил

Unmarshaller jaxbUnmarshaller = jaxbContext.createUnmarshaller();
RSS newrss = null;
if (oldItem == null) {
RSS rss = (RSS) jaxbUnmarshaller.unmarshal(xml);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Если нет oldItem мы просто забиваем и ничего не делаем?

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

подразумевается что это происходит один раз на первой итерации

oldItem = rss.getChannel().getItems().get(0);
} else {
newrss = (RSS) jaxbUnmarshaller.unmarshal(xml);
List<Item> newItems = newrss.getChannel().getItems();
int num = 0;
for (int i = 0; i < newItems.size(); i++) {
if (newItems.get(i).getPubDate().equals(oldItem.getPubDate())) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

А pubDate точно уникальная?

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

неизвестно

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

добавил еще проверку заголовка

num = i;
break;
}
}
// for (Item i : newrss.getChannel().getItems().subList(0, num)) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

В чем смысл этого закомменченного кода?

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

можно считать что это был мираж

// System.out.println(i.getPubDate());
// }
if (num > 0) {
System.out.println("news: " + num);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Опять логирование через sout

for (int i = 0; i < num; i++) {
System.out.println(newrss.getChannel().getItems().get(i).getTitle());
}
}
for (Item i : newrss.getChannel().getItems().subList(0, num)) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Я не очень понимаю, зачем здесь нужен subList

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

в newrss 200 последних новостей, это чтобы выбрать только не обработанные ранее

Document doc = Jsoup.connect(i.getLink()).get();
Element item = doc.getElementById("root");
Elements links = item.getElementsByTag("p");
StringBuilder builder = new StringBuilder();
for (Element link : links) {
builder.append(link.text()).append(" ");
}
news.add(new News(i.getTitle(), i.getCategory(), builder.toString()));
}
oldItem = newrss.getChannel().getItems().get(0);
}
for (News n : news) {
addNewFile(directory, n);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

А где отправка XMPP серверу?

}
Thread.sleep(60 * 1000);
}

}

static String sendRequest(String url) throws IOException {
HttpClient client = new DefaultHttpClient();
HttpGet request = new HttpGet(url);
HttpResponse response = client.execute(request);
BufferedReader rd = new BufferedReader
(new InputStreamReader(
response.getEntity().getContent()));
StringBuilder resp = new StringBuilder();
String line = "";
while ((line = rd.readLine()) != null) {
resp.append(line);
}
return resp.toString();
}
}



class News {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Давайте эти классы в отдельные файлы вынесем

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ок


private String title;
private String category;
private String text;

public News(String title, String category, String text) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Приоткрытии в идее весь код желтый. Давайте сделаем так, чтобы он был зеленый :) Можем натравить общий codeStyle

this.text = text;
this.category = category;
this.title = title;
}

public String getText() {
return text;
}

public void setText(String text) {
this.text = text;
}

public String getCategory() {
return category;
}

public void setCategory(String category) {
this.category = category;
}

public String getTitle() {
return title;
}

public void setTitle(String title) {
this.title = title;
}

@Override
public String toString() {
return "News{" +
"title='" + title + '\'' +
", category='" + category + '\'' +
", text='" + text + '\'' +
'}';
}
}

@XmlRootElement(name = "rss")
class RSS {

@XmlElement(name = "channel")
private Channel channel;

@XmlTransient
public Channel getChannel() {
return channel;
}

public void setChannel(Channel channel) {
this.channel = channel;
}
}



@XmlRootElement(name = "channel")
class Channel {

@XmlElement(name = "language")
private String language;
@XmlElement(name = "title")
private String title;
@XmlElement(name = "description")
private String description;
@XmlElement(name = "link")
private String link;
@XmlElement(name = "item")
private List<Item> items = new LinkedList<>();

@XmlTransient
public String getLanguage() {
return language;
}

public void setLanguage(String language) {
this.language = language;
}

@XmlTransient
public String getTitle() {
return title;
}

public void setTitle(String title) {
this.title = title;
}

@XmlTransient
public String getDescription() {
return description;
}

public void setDescription(String description) {
this.description = description;
}
@XmlTransient
public String getLink() {
return link;
}

public void setLink(String link) {
this.link = link;
}

@XmlTransient
public List<Item> getItems() {
return items;
}

}


@XmlRootElement(name = "item")
class Item {

@XmlElement(name = "title")
private String title;
@XmlElement(name = "link")
private String link;
@XmlElement(name = "description")
private String description;
@XmlElement(name = "pubDate")
private String pubDate;
@XmlElement(name = "category")
private String category;

@XmlTransient
public String getTitle() {
return title;
}

public void setTitle(String title) {
this.title = title;
}

@XmlTransient
public String getLink() {
return link;
}

public void setLink(String link) {
this.link = link;
}

@XmlTransient
public String getDescription() {
return description;
}

public void setDescription(String description) {
this.description = description;
}

@XmlTransient
public String getPubDate() {
return pubDate;
}

public void setPubDate(String pubDate) {
this.pubDate = pubDate;
}

@XmlTransient
public String getCategory() {
return category;
}

public void setCategory(String category) {
this.category = category;
}

}