Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Lenta grabber #9

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions flamenews/lenta-grabber/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Maven module descriptor for the flamenews-lenta-grabber artifact. -->
<modelVersion>4.0.0</modelVersion>

<!-- Inherits from the flamenews POM located one directory up. -->
<parent>
<groupId>com.spbsu.flamestream.flamenews</groupId>
<artifactId>flamenews</artifactId>
<version>1.0-SNAPSHOT</version>
<relativePath>../</relativePath>
</parent>

<artifactId>flamenews-lenta-grabber</artifactId>
<!-- Same value as the parent version declared above. -->
<version>1.0-SNAPSHOT</version>

<!--
No dependency below declares a version; presumably the versions are supplied by
dependencyManagement in the parent POM (not visible here - TODO confirm).
-->
<dependencies>
<dependency>
<groupId>javax.xml.bind</groupId>
<artifactId>jaxb-api</artifactId>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpcore</artifactId>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
</dependency>
</dependencies>
</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
package com.spbsu.flamestream.flamenews.lenta;

import com.spbsu.flamestream.flamenews.lenta.model.Item;
import com.spbsu.flamestream.flamenews.lenta.model.News;
import com.spbsu.flamestream.flamenews.lenta.model.RSS;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBException;
import javax.xml.bind.Unmarshaller;
import javax.xml.transform.stream.StreamSource;
import java.awt.*;
import java.io.*;
import java.text.ParseException;
import java.util.List;
import java.util.Map;
import java.time.format.DateTimeFormatter;
import java.time.LocalDateTime;


/**
 * Command-line entry point that downloads an RSS feed, fetches the page behind every feed item
 * and passes the result to a {@link NewsSaver}.
 *
 * <p>Configuration is taken from environment variables:
 * <ul>
 *   <li>{@code LENTA_URL} - feed address, defaults to {@code https://lenta.ru/rss/news};</li>
 *   <li>{@code OUT_DIR} - output directory, defaults to {@code ../news/}.</li>
 * </ul>
 */
public class LentaGrabber {
  /** Parses the RFC 1123 timestamps of {@code pubDate}; immutable, so created once. */
  private static final DateTimeFormatter PUB_DATE_FORMAT = DateTimeFormatter.RFC_1123_DATE_TIME;

  /**
   * Fetches the feed once and saves its items, oldest list position last-to-first.
   *
   * @param args ignored
   * @throws IOException if the feed or one of the article pages cannot be downloaded
   * @throws JAXBException if the feed cannot be unmarshalled into {@link RSS}
   * @throws InterruptedException never thrown by the current body; kept for interface stability
   */
  public static void main(String[] args) throws IOException, JAXBException, InterruptedException {
    final Map<String, String> env = System.getenv();
    final String urlLenta = env.getOrDefault("LENTA_URL", "https://lenta.ru/rss/news");
    final String directory = env.getOrDefault("OUT_DIR", "../news/");
    // File(parent, child) inserts the separator itself, so OUT_DIR works with or without a
    // trailing slash (plain concatenation produced e.g. "outlastSavedNews").
    final String lastSavedNews = new File(directory, "lastSavedNews").getPath();

    final JAXBContext jaxbContext = JAXBContext.newInstance(RSS.class);
    final Unmarshaller jaxbUnmarshaller = jaxbContext.createUnmarshaller();
    final HttpClient client = new DefaultHttpClient();
    final HttpGet request = new HttpGet(urlLenta);

    final NewsSaver saveNews = new NewsSaver(directory, lastSavedNews, -1);
    while (!Thread.currentThread().isInterrupted()) {
      final RSS rss = fetchFeed(client, request, jaxbUnmarshaller);
      final List<Item> itemList = rss.getChannel().getItems();
      // Walk the list backwards so the last feed entry is saved first.
      for (int k = itemList.size() - 1; k >= 0; k--) {
        final Item i = itemList.get(k);
        final String link = normalize(i.getLink());
        final LocalDateTime pubDate = LocalDateTime.parse(normalize(i.getPubDate()), PUB_DATE_FORMAT);
        saveNews.save(new News(normalize(i.getTitle()), normalize(i.getCategory()), fetchText(link), pubDate));
      }
      // NOTE(review): this unconditional return ends the loop after the first pass, so the
      // feed is fetched exactly once per run. Left untouched; if polling is intended, replace
      // it with a delay between iterations.
      return;
    }
  }

  /**
   * Downloads the feed and unmarshals it.
   *
   * <p>The raw byte stream is handed to JAXB so that the XML parser picks the encoding from the
   * document itself; decoding with the platform charset and gluing lines together could corrupt
   * non-ASCII text and merge content that was split across lines. Closing the stream also
   * releases the underlying HTTP connection.
   */
  private static RSS fetchFeed(HttpClient client, HttpGet request, Unmarshaller unmarshaller)
          throws IOException, JAXBException {
    final HttpResponse response = client.execute(request);
    if (response.getEntity() == null) {
      throw new IOException("Empty response from " + request.getURI());
    }
    try (InputStream body = response.getEntity().getContent()) {
      final int status = response.getStatusLine().getStatusCode();
      if (status < 200 || status >= 300) {
        throw new IOException("Unexpected HTTP status " + status + " from " + request.getURI());
      }
      return (RSS) unmarshaller.unmarshal(new StreamSource(body));
    }
  }

  /**
   * Downloads the page at {@code link} and joins the text of its {@code <p>} elements, each
   * followed by a single space.
   */
  private static String fetchText(String link) throws IOException {
    final Document doc = Jsoup.connect(link).get();
    final Element root = doc.getElementById("root");
    // Pages without an element with id "root" used to fail with a NullPointerException;
    // fall back to the paragraphs of the whole document instead.
    final Elements paragraphs = (root != null ? root : doc).getElementsByTag("p");
    final StringBuilder builder = new StringBuilder();
    for (Element paragraph : paragraphs) {
      builder.append(paragraph.text()).append(" ");
    }
    return builder.toString();
  }

  /**
   * Collapses runs of whitespace (including line breaks inside an XML element) into single
   * spaces and trims the ends; {@code null} is passed through unchanged.
   */
  private static String normalize(String value) {
    return value == null ? null : value.replaceAll("\\s+", " ").trim();
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
package com.spbsu.flamestream.flamenews.lenta;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.util.logging.Logger;

import com.spbsu.flamestream.flamenews.lenta.model.News;

public class NewsSaver implements SaveToFile {
private static Logger log = Logger.getLogger(NewsSaver.class.getName());

private final String dirPath;
private final String lastSavedNews;
private int prevName;

NewsSaver(String dirPath, String lastSavedNews, int prevName) {
this.dirPath = dirPath;
this.lastSavedNews = lastSavedNews;
this.prevName = prevName;
}

public void save(News news) {
final File theDir = new File(dirPath);
if (!theDir.exists()) {
log.info("Creating directory: " + dirPath);
try {
if (!theDir.mkdir()) {
throw new RuntimeException("Can't create dir: " + dirPath);
}
log.info("Directory created");
} catch (SecurityException e) {
log.info(e.getMessage());
throw new RuntimeException(e);
}
}

final File saveConfig = new File(lastSavedNews);
if (!saveConfig.exists()) {
log.info("Creating file: " + lastSavedNews);
try {
if (!saveConfig.createNewFile()) {
throw new RuntimeException("Can't create file: " + dirPath);
}
log.info("File created");
} catch (IOException e) {
log.info(e.getMessage());
throw new RuntimeException(e);
}
}

try {
final FileReader fr = new FileReader(lastSavedNews);
final BufferedReader reader = new BufferedReader(fr);
LocalDateTime pubDate = null;
String pubTitle = null;
try {
DateTimeFormatter formatter = DateTimeFormatter.ISO_DATE_TIME;
pubDate = LocalDateTime.parse(reader.readLine(), formatter);
pubTitle = reader.readLine();
} catch (IOException e) {
log.info(e.getMessage());
} catch (NullPointerException e) {
log.info("pubDate or pubTitle is null");
}

if (pubDate == null || pubTitle == null ||
(pubDate.isBefore(news.getPubDate()) ||
(pubDate.isEqual(news.getPubDate()) && !pubTitle.equals(news.getTitle())))) {
int curNameOfFile = 0;
final File[] files = theDir.listFiles();
if (files == null) {
throw new RuntimeException("Can't get list files");
}

if (prevName < 0) {
for (File file : files) {
String name = file.getName();
int num;
try {
num = Integer.parseInt(name.substring(0, name.lastIndexOf(".")));
} catch (NumberFormatException | StringIndexOutOfBoundsException e) {
continue;
}
if (num > curNameOfFile) {
curNameOfFile = num;
}
}
} else {
curNameOfFile = prevName;
}

curNameOfFile++;
try {
final String filePath = dirPath + String.valueOf(curNameOfFile) + ".xml";
final FileWriter writer = new FileWriter(filePath, false);
writer.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
writer.append('\n');
writer.write("<item>");
writer.append('\n');
writer.write("<title>" + news.getTitle() + "</title>");
writer.append('\n');
writer.write("<text>" + news.getText() + "</text>");
writer.append('\n');
writer.write("<category>" + news.getCategory() + "</category>");
writer.append('\n');
writer.write("</item>");
writer.flush();
} catch (IOException e) {
log.info(e.getMessage());
throw new RuntimeException(e);
}

try {
final FileWriter writer = new FileWriter(lastSavedNews, false);
writer.write(news.getPubDate().toString());
writer.append('\n');
writer.write(news.getTitle());
writer.append('\n');
writer.flush();
} catch (IOException e) {
log.info(e.getMessage());
throw new RuntimeException(e);
}

prevName = curNameOfFile;
}
} catch (FileNotFoundException e) {
throw new RuntimeException(e);
}


}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
package com.spbsu.flamestream.flamenews.lenta;

import com.spbsu.flamestream.flamenews.lenta.model.News;

/**
 * Package-private contract for a component that accepts a {@link News} item for saving.
 */
interface SaveToFile {
/**
 * Saves the given news item.
 *
 * @param news item to save
 */
void save(News news);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
package com.spbsu.flamestream.flamenews.lenta.model;

import javax.xml.bind.annotation.XmlElement;
import javax.xml.bind.annotation.XmlRootElement;
import javax.xml.bind.annotation.XmlTransient;
import java.util.LinkedList;
import java.util.List;

/**
 * JAXB model of the {@code channel} element: language, title, description, link and the list
 * of {@code item} children.
 *
 * <p>The XML mapping is declared on the private fields via {@code @XmlElement}, while every
 * getter is marked {@code @XmlTransient}; presumably this keeps JAXB from binding the accessor
 * pairs as a second set of properties with the same names (TODO confirm against the JAXB
 * access-type rules).
 */
@XmlRootElement(name = "channel")
public class Channel {

@XmlElement(name = "language")
private String language;
@XmlElement(name = "title")
private String title;
@XmlElement(name = "description")
private String description;
@XmlElement(name = "link")
private String link;
// Starts out as an empty list, so getItems() returns a list even before anything is added.
@XmlElement(name = "item")
private List<Item> items = new LinkedList<>();

/** Returns the value of the {@code language} element. */
@XmlTransient
public String getLanguage() {
return language;
}

/** Sets the value of the {@code language} element. */
public void setLanguage(String language) {
this.language = language;
}

/** Returns the value of the {@code title} element. */
@XmlTransient
public String getTitle() {
return title;
}

/** Sets the value of the {@code title} element. */
public void setTitle(String title) {
this.title = title;
}

/** Returns the value of the {@code description} element. */
@XmlTransient
public String getDescription() {
return description;
}

/** Sets the value of the {@code description} element. */
public void setDescription(String description) {
this.description = description;
}
/** Returns the value of the {@code link} element. */
@XmlTransient
public String getLink() {
return link;
}

/** Sets the value of the {@code link} element. */
public void setLink(String link) {
this.link = link;
}

/** Returns the internal, mutable list of items; there is no corresponding setter. */
@XmlTransient
public List<Item> getItems() {
return items;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
package com.spbsu.flamestream.flamenews.lenta.model;

import javax.xml.bind.annotation.XmlElement;
import javax.xml.bind.annotation.XmlRootElement;
import javax.xml.bind.annotation.XmlTransient;

/**
 * JAXB model of an {@code item} element: title, link, description, publication date and
 * category, all kept as plain strings.
 *
 * <p>The XML mapping is declared on the private fields via {@code @XmlElement}, while every
 * getter is marked {@code @XmlTransient}; presumably this keeps JAXB from binding the accessor
 * pairs as a second set of properties with the same names (TODO confirm against the JAXB
 * access-type rules).
 */
@XmlRootElement(name = "item")
public class Item {

@XmlElement(name = "title")
private String title;
@XmlElement(name = "link")
private String link;
@XmlElement(name = "description")
private String description;
// Stored as the raw string from the XML; it is not parsed into a date type in this class.
@XmlElement(name = "pubDate")
private String pubDate;
@XmlElement(name = "category")
private String category;

/** Returns the value of the {@code title} element. */
@XmlTransient
public String getTitle() {
return title;
}

/** Sets the value of the {@code title} element. */
public void setTitle(String title) {
this.title = title;
}

/** Returns the value of the {@code link} element. */
@XmlTransient
public String getLink() {
return link;
}

/** Sets the value of the {@code link} element. */
public void setLink(String link) {
this.link = link;
}

/** Returns the value of the {@code description} element. */
@XmlTransient
public String getDescription() {
return description;
}

/** Sets the value of the {@code description} element. */
public void setDescription(String description) {
this.description = description;
}

/** Returns the unparsed value of the {@code pubDate} element. */
@XmlTransient
public String getPubDate() {
return pubDate;
}

/** Sets the unparsed value of the {@code pubDate} element. */
public void setPubDate(String pubDate) {
this.pubDate = pubDate;
}

/** Returns the value of the {@code category} element. */
@XmlTransient
public String getCategory() {
return category;
}

/** Sets the value of the {@code category} element. */
public void setCategory(String category) {
this.category = category;
}

}
Loading