-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Lenta grabber #9
base: master
Are you sure you want to change the base?
Changes from 3 commits
1774c38
da7f1e9
5e36938
fbb70e8
5ddb9d1
fe9a54e
890fe34
bdb9349
2775d54
19ca3bf
e35f044
c737907
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<project xmlns="http://maven.apache.org/POM/4.0.0" | ||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | ||
<modelVersion>4.0.0</modelVersion> | ||
|
||
<parent> | ||
<groupId>com.spbsu.flamestream.flamenews</groupId> | ||
<artifactId>flamenews</artifactId> | ||
<version>1.0-SNAPSHOT</version> | ||
<relativePath>../</relativePath> | ||
</parent> | ||
|
||
<artifactId>flamenews-lenta-grabber</artifactId> | ||
<version>1.0-SNAPSHOT</version> | ||
|
||
<dependencies> | ||
<dependency> | ||
<groupId>javax.xml.bind</groupId> | ||
<artifactId>jaxb-api</artifactId> | ||
<version>2.3.1</version> | ||
</dependency> | ||
<dependency> | ||
<groupId>org.apache.httpcomponents</groupId> | ||
<artifactId>httpclient</artifactId> | ||
<version>4.2.3</version> | ||
</dependency> | ||
<dependency> | ||
<groupId>org.apache.httpcomponents</groupId> | ||
<artifactId>httpcore</artifactId> | ||
<version>4.2.4</version> | ||
</dependency> | ||
<dependency> | ||
<groupId>org.jsoup</groupId> | ||
<artifactId>jsoup</artifactId> | ||
<version>1.11.3</version> | ||
</dependency> | ||
</dependencies> | ||
</project> |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,326 @@ | ||
package com.spbsu.flamestream.flamenews.lenta; | ||
|
||
import org.apache.http.HttpResponse; | ||
import org.apache.http.client.HttpClient; | ||
import org.apache.http.client.methods.HttpGet; | ||
import org.apache.http.impl.client.DefaultHttpClient; | ||
import org.jsoup.Jsoup; | ||
import org.jsoup.nodes.Document; | ||
import org.jsoup.nodes.Element; | ||
import org.jsoup.select.Elements; | ||
|
||
import javax.xml.bind.JAXBContext; | ||
import javax.xml.bind.JAXBException; | ||
import javax.xml.bind.Unmarshaller; | ||
import javax.xml.bind.annotation.*; | ||
import javax.xml.transform.stream.StreamSource; | ||
import java.io.*; | ||
import java.util.ArrayList; | ||
import java.util.LinkedList; | ||
import java.util.List; | ||
|
||
public class LentaGrabber { | ||
|
||
private static void addNewFile (String dirPath, News news) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Это функционал, который не относится напрямую к LentaGrabber. Давайте сделаем для него интерфейс и вынесем отдельно. |
||
int curNameOfFile = 0; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Между объявлением переменной и ее использованием как-то слишком много левого кода :) |
||
File theDir = new File(dirPath); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Мы все неизменияемые поля и переменные объявляем final, давайте соблюдать такой code-style |
||
if (!theDir.exists()) { | ||
System.out.println("Creating directory: " + dirPath); | ||
try{ | ||
theDir.mkdir(); | ||
System.out.println("Directory created"); | ||
} | ||
catch(SecurityException e){ | ||
System.out.println(e.getMessage()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Давайте логировать не sout-ами, а Logger-ом There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
} | ||
} | ||
|
||
File[] files = theDir.listFiles(); | ||
for (File file : files) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Это точно нужно делать на каждый вызов метода? |
||
String name = file.getName(); | ||
int num; | ||
try { | ||
num = Integer.parseInt(name.substring(0, name.lastIndexOf("."))); | ||
} catch (NumberFormatException e) { | ||
continue; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Если мы здесь обломались, наверное что-то пошло не так и продолжать не стоит There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Я имел в виду, что если что-то (зачем-то) положил в директорию |
||
} | ||
if (num > curNameOfFile) { | ||
curNameOfFile = num; | ||
} | ||
} | ||
|
||
curNameOfFile++; | ||
String filePath = dirPath + String.valueOf(curNameOfFile) + ".xml"; | ||
try { | ||
FileWriter writer = new FileWriter(filePath, false); | ||
writer.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"); | ||
writer.append('\n'); | ||
writer.write("<item>"); | ||
writer.append('\n'); | ||
writer.write("<title>" + news.getTitle() + "</title>"); | ||
writer.append('\n'); | ||
writer.write("<text>" + news.getText() + "</text>"); | ||
writer.append('\n'); | ||
writer.write("<category>" + news.getCategory() + "</category>"); | ||
writer.append('\n'); | ||
writer.write("</item>"); | ||
writer.flush(); | ||
} | ||
catch (IOException e) { | ||
System.out.println(e.getMessage()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. То же, что и выше |
||
} | ||
} | ||
|
||
private static final String directory = "../news/"; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Я бы это парметром передавал в main There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. теперь так |
||
|
||
private static final String URL_LENTA = "https://lenta.ru/rss/news"; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Это, возможно, тоже There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. теперь так |
||
|
||
public static void main(String[] args) throws IOException, JAXBException, InterruptedException { | ||
Item oldItem = null; | ||
for(;;) { | ||
List<News> news = new ArrayList<>(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Зачем нужен этот список? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. может быть несколько новостей за последнюю минуту |
||
StreamSource xml = new StreamSource(new StringReader(sendRequest(URL_LENTA))); | ||
JAXBContext jaxbContext = JAXBContext.newInstance(RSS.class); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Тут точно нельзя вынести создание некоторых объектов из цикла? Плодить объект на каждую итерацию без неообходимости - не круто There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. исправил |
||
Unmarshaller jaxbUnmarshaller = jaxbContext.createUnmarshaller(); | ||
RSS newrss = null; | ||
if (oldItem == null) { | ||
RSS rss = (RSS) jaxbUnmarshaller.unmarshal(xml); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Если нет oldItem мы просто забиваем и ничего не делаем? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. подразумевается что это происходит один раз на первой итерации |
||
oldItem = rss.getChannel().getItems().get(0); | ||
} else { | ||
newrss = (RSS) jaxbUnmarshaller.unmarshal(xml); | ||
List<Item> newItems = newrss.getChannel().getItems(); | ||
int num = 0; | ||
for (int i = 0; i < newItems.size(); i++) { | ||
if (newItems.get(i).getPubDate().equals(oldItem.getPubDate())) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. А pubDate точно уникальная? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. неизвестно There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. добавил еще проверку заголовка |
||
num = i; | ||
break; | ||
} | ||
} | ||
// for (Item i : newrss.getChannel().getItems().subList(0, num)) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. В чем смысл этого закомменченного кода? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. можно считать что это был мираж |
||
// System.out.println(i.getPubDate()); | ||
// } | ||
if (num > 0) { | ||
System.out.println("news: " + num); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Опять логирование через sout |
||
for (int i = 0; i < num; i++) { | ||
System.out.println(newrss.getChannel().getItems().get(i).getTitle()); | ||
} | ||
} | ||
for (Item i : newrss.getChannel().getItems().subList(0, num)) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Я не очень понимаю, зачем здесь нужен subList There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. в newrss 200 последних новостей, это чтобы выбрать только не обработанные ранее |
||
Document doc = Jsoup.connect(i.getLink()).get(); | ||
Element item = doc.getElementById("root"); | ||
Elements links = item.getElementsByTag("p"); | ||
StringBuilder builder = new StringBuilder(); | ||
for (Element link : links) { | ||
builder.append(link.text()).append(" "); | ||
} | ||
news.add(new News(i.getTitle(), i.getCategory(), builder.toString())); | ||
} | ||
oldItem = newrss.getChannel().getItems().get(0); | ||
} | ||
for (News n : news) { | ||
addNewFile(directory, n); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. А где отправка XMPP серверу? |
||
} | ||
Thread.sleep(60 * 1000); | ||
} | ||
|
||
} | ||
|
||
static String sendRequest(String url) throws IOException { | ||
HttpClient client = new DefaultHttpClient(); | ||
HttpGet request = new HttpGet(url); | ||
HttpResponse response = client.execute(request); | ||
BufferedReader rd = new BufferedReader | ||
(new InputStreamReader( | ||
response.getEntity().getContent())); | ||
StringBuilder resp = new StringBuilder(); | ||
String line = ""; | ||
while ((line = rd.readLine()) != null) { | ||
resp.append(line); | ||
} | ||
return resp.toString(); | ||
} | ||
} | ||
|
||
|
||
|
||
class News { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Давайте эти классы в отдельные файлы вынесем There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ок |
||
|
||
private String title; | ||
private String category; | ||
private String text; | ||
|
||
public News(String title, String category, String text) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Приоткрытии в идее весь код желтый. Давайте сделаем так, чтобы он был зеленый :) Можем натравить общий codeStyle |
||
this.text = text; | ||
this.category = category; | ||
this.title = title; | ||
} | ||
|
||
public String getText() { | ||
return text; | ||
} | ||
|
||
public void setText(String text) { | ||
this.text = text; | ||
} | ||
|
||
public String getCategory() { | ||
return category; | ||
} | ||
|
||
public void setCategory(String category) { | ||
this.category = category; | ||
} | ||
|
||
public String getTitle() { | ||
return title; | ||
} | ||
|
||
public void setTitle(String title) { | ||
this.title = title; | ||
} | ||
|
||
@Override | ||
public String toString() { | ||
return "News{" + | ||
"title='" + title + '\'' + | ||
", category='" + category + '\'' + | ||
", text='" + text + '\'' + | ||
'}'; | ||
} | ||
} | ||
|
||
@XmlRootElement(name = "rss") | ||
class RSS { | ||
|
||
@XmlElement(name = "channel") | ||
private Channel channel; | ||
|
||
@XmlTransient | ||
public Channel getChannel() { | ||
return channel; | ||
} | ||
|
||
public void setChannel(Channel channel) { | ||
this.channel = channel; | ||
} | ||
} | ||
|
||
|
||
|
||
@XmlRootElement(name = "channel") | ||
class Channel { | ||
|
||
@XmlElement(name = "language") | ||
private String language; | ||
@XmlElement(name = "title") | ||
private String title; | ||
@XmlElement(name = "description") | ||
private String description; | ||
@XmlElement(name = "link") | ||
private String link; | ||
@XmlElement(name = "item") | ||
private List<Item> items = new LinkedList<>(); | ||
|
||
@XmlTransient | ||
public String getLanguage() { | ||
return language; | ||
} | ||
|
||
public void setLanguage(String language) { | ||
this.language = language; | ||
} | ||
|
||
@XmlTransient | ||
public String getTitle() { | ||
return title; | ||
} | ||
|
||
public void setTitle(String title) { | ||
this.title = title; | ||
} | ||
|
||
@XmlTransient | ||
public String getDescription() { | ||
return description; | ||
} | ||
|
||
public void setDescription(String description) { | ||
this.description = description; | ||
} | ||
@XmlTransient | ||
public String getLink() { | ||
return link; | ||
} | ||
|
||
public void setLink(String link) { | ||
this.link = link; | ||
} | ||
|
||
@XmlTransient | ||
public List<Item> getItems() { | ||
return items; | ||
} | ||
|
||
} | ||
|
||
|
||
@XmlRootElement(name = "item") | ||
class Item { | ||
|
||
@XmlElement(name = "title") | ||
private String title; | ||
@XmlElement(name = "link") | ||
private String link; | ||
@XmlElement(name = "description") | ||
private String description; | ||
@XmlElement(name = "pubDate") | ||
private String pubDate; | ||
@XmlElement(name = "category") | ||
private String category; | ||
|
||
@XmlTransient | ||
public String getTitle() { | ||
return title; | ||
} | ||
|
||
public void setTitle(String title) { | ||
this.title = title; | ||
} | ||
|
||
@XmlTransient | ||
public String getLink() { | ||
return link; | ||
} | ||
|
||
public void setLink(String link) { | ||
this.link = link; | ||
} | ||
|
||
@XmlTransient | ||
public String getDescription() { | ||
return description; | ||
} | ||
|
||
public void setDescription(String description) { | ||
this.description = description; | ||
} | ||
|
||
@XmlTransient | ||
public String getPubDate() { | ||
return pubDate; | ||
} | ||
|
||
public void setPubDate(String pubDate) { | ||
this.pubDate = pubDate; | ||
} | ||
|
||
@XmlTransient | ||
public String getCategory() { | ||
return category; | ||
} | ||
|
||
public void setCategory(String category) { | ||
this.category = category; | ||
} | ||
|
||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Версии мы задаем в parent.pom