/*
 * Decompiled with CFR 0.152.
 */
package org.knowm.datasets.reuters21578;

import com.google.common.base.Joiner;
import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.WildcardFileFilter;
import org.knowm.datasets.reuters21578.Reuters21578;
import org.knowm.datasets.reuters21578.Reuters21578DAO;

/**
 * Command-line loader that reads every {@code *.sgm} file in {@code ./raw/}, cuts it into
 * {@code <REUTERS>} stories, converts each story into a {@link Reuters21578} bean and inserts
 * it through {@link Reuters21578DAO}.
 *
 * <p>The markup is handled with simple case-insensitive tag scanning, not a real SGML parser.
 */
public class RawData2DB {
    /** Length of the longest whitespace-normalized story body seen so far. */
    int maxBodyLength = 0;

    /**
     * Recreates the table and imports all stories.
     *
     * @param args passed unchanged to {@link Reuters21578DAO#init}
     * @throws IOException if the raw directory cannot be listed or a file cannot be read
     */
    public static void main(String[] args) throws IOException {
        Reuters21578DAO.init(args);
        try {
            Reuters21578DAO.dropTable();
            Reuters21578DAO.createTable();
            RawData2DB dp = new RawData2DB();
            dp.go();
        } finally {
            // Release the DAO even when the import aborts half way through.
            Reuters21578DAO.release();
        }
    }

    /**
     * Imports every {@code *.sgm} file below {@code ./raw/} and prints progress, the running
     * count of failed inserts and the maximum body length to standard output.
     *
     * @throws IOException if {@code ./raw/} cannot be listed or a file cannot be read
     */
    private void go() throws IOException {
        File dir = new File("./raw/");
        FileFilter fileFilter = new WildcardFileFilter("*.sgm");
        File[] files = dir.listFiles(fileFilter);
        if (files == null) {
            // listFiles() signals "not a directory" or an I/O error with null, not an exception.
            throw new IOException("Unable to list directory " + dir.getAbsolutePath());
        }
        int errors = 0;
        for (int f = 0; f < files.length; ++f) {
            System.out.println("file " + f + " of " + files.length);
            System.out.println("fileNames[f]: " + files[f]);
            List<Reuters21578> stories = this.getReuters21578StoriesFromFile(files[f]);
            for (Reuters21578 reuters21578Story : stories) {
                // A negative return value from insert() is counted as a failed insert.
                if (Reuters21578DAO.insert(reuters21578Story) >= 0) {
                    continue;
                }
                System.out.println("errors=" + ++errors);
            }
        }
        System.out.println("maxBodyLength = " + this.maxBodyLength);
    }

    /**
     * Reads one file as UTF-8 and converts every {@code <REUTERS>} element into a story bean.
     *
     * @param file the file to read
     * @return the stories in the order they appear in the file
     * @throws IOException if the file cannot be read
     */
    private List<Reuters21578> getReuters21578StoriesFromFile(File file) throws IOException {
        List<Reuters21578> stories = new ArrayList<Reuters21578>();
        String s = FileUtils.readFileToString(file, "UTF-8");
        List<String> storiesAsString = this.extractElementAsLines(s, "REUTERS");
        for (String storyAsString : storiesAsString) {
            stories.add(this.getReuters21578StoryFromText(storyAsString));
        }
        return stories;
    }

    /**
     * Builds a story bean from the text of one {@code <REUTERS>} element as produced by
     * {@link #extractElementAsLines}: the first line holds the element's attributes, the rest
     * holds the child elements.
     *
     * @param storyText the text of a single story
     * @return the populated bean; its date is {@code null} when the DATE text cannot be parsed
     * @throws NumberFormatException if NEWID or OLDID is missing or not an integer
     */
    private Reuters21578 getReuters21578StoryFromText(String storyText) {
        // Split on LF or CRLF rather than the platform separator so that the header line is
        // found regardless of the OS; limit 2 avoids splitting the rest of the story.
        String firstLine = storyText.split("\\r?\\n", 2)[0];
        int newId = Integer.parseInt(this.extractAttribute(firstLine, "NEWID"));
        int oldId = Integer.parseInt(this.extractAttribute(firstLine, "OLDID"));
        boolean topicsBool = this.extractAttribute(firstLine, "TOPICS").equalsIgnoreCase("YES");
        String lewissplitString = this.extractAttribute(firstLine, "LEWISSPLIT");
        String cgisplitString = this.extractAttribute(firstLine, "CGISPLIT");

        String dateString = this.extractTextBetweenTags(storyText, "DATE");
        Date date = null;
        try {
            // English month abbreviations must be parsed with an English locale; the default
            // locale of the machine may use different month names.
            // NOTE(review): "SS" reads the trailing digits as milliseconds, so ".79" becomes
            // 79 ms rather than 790 ms. Left unchanged to keep stored values stable.
            SimpleDateFormat sdf = new SimpleDateFormat("dd-MMM-yyyy HH:mm:ss.SS", Locale.ENGLISH);
            date = sdf.parse(dateString);
        }
        catch (ParseException e) {
            // Best effort: the story is still imported, just without a date.
            e.printStackTrace();
        }

        String body = this.extractTextBetweenTags(storyText, "BODY").replaceAll("\\s+", " ");
        if (body.length() > this.maxBodyLength) {
            this.maxBodyLength = body.length();
        }

        Reuters21578 reuters21578 = new Reuters21578();
        reuters21578.setNewid(newId);
        reuters21578.setOldid(oldId);
        reuters21578.setTopicsbool(topicsBool);
        reuters21578.setLewissplit(lewissplitString);
        reuters21578.setCgisplit(cgisplitString);
        reuters21578.setDate(date);
        reuters21578.setTopics(this.extractCsv(storyText, "TOPICS"));
        reuters21578.setPlaces(this.extractCsv(storyText, "PLACES"));
        reuters21578.setPeople(this.extractCsv(storyText, "PEOPLE"));
        reuters21578.setOrgs(this.extractCsv(storyText, "ORGS"));
        reuters21578.setExchanges(this.extractCsv(storyText, "EXCHANGES"));
        reuters21578.setCompanies(this.extractCsv(storyText, "COMPANIES"));
        reuters21578.setTitle(this.extractTextBetweenTags(storyText, "TITLE"));
        reuters21578.setDateline(this.extractTextBetweenTags(storyText, "DATELINE"));
        reuters21578.setBody(body);
        return reuters21578;
    }

    /** Joins the {@code <D>} entries found inside the given element with commas. */
    private String extractCsv(String storyText, String tagname) {
        String section = this.extractTextBetweenTags(storyText, tagname);
        return Joiner.on(",").join(this.extractElementAsLines(section, "D"));
    }

    /**
     * Collects the content of every element whose opening tag starts with {@code <tagname}.
     *
     * <p>Matching is case-insensitive and prefix based, so the opening tag may carry
     * attributes. The character directly after the tag name is skipped; anything following it
     * up to the closing tag, including attribute text, is part of the returned content.
     *
     * @param stringContainingText the text to scan
     * @param tagname the element name without angle brackets
     * @return one entry per closed element, in document order; empty if none is found
     */
    protected List<String> extractElementAsLines(String stringContainingText, String tagname) {
        String openTag = "<" + tagname;
        String closeTag = "</" + tagname;
        List<String> cuts = new ArrayList<String>();
        StringBuilder buf = new StringBuilder();
        boolean record = false;
        int length = stringContainingText.length();
        for (int i = 0; i < length; ++i) {
            // Only a closing tag that ends an open element produces a cut; a stray closing
            // tag must not emit an empty or duplicated entry.
            if (record && stringContainingText.regionMatches(true, i, closeTag, 0, closeTag.length())) {
                cuts.add(buf.toString());
                record = false;
            }
            if (record) {
                buf.append(stringContainingText.charAt(i));
            }
            if (!stringContainingText.regionMatches(true, i, openTag, 0, openTag.length())) {
                continue;
            }
            buf = new StringBuilder();
            // Together with the loop increment this skips the tag name plus one more character.
            i += openTag.length();
            record = true;
        }
        return cuts;
    }

    /**
     * Returns the trimmed text between the first {@code <tagname>} and the next
     * {@code </tagname>}, matched case-insensitively.
     *
     * <p>If the element is never closed, the text up to the end of the input is returned. If
     * the opening tag occurs again before the closing tag, collection restarts there.
     *
     * @param stringContainingText the text to scan
     * @param tagname the element name without angle brackets
     * @return the trimmed content, or an empty string if the opening tag is not found
     */
    protected String extractTextBetweenTags(String stringContainingText, String tagname) {
        String openTag = "<" + tagname + ">";
        String closeTag = "</" + tagname + ">";
        StringBuilder buf = new StringBuilder();
        boolean record = false;
        int length = stringContainingText.length();
        for (int i = 0; i < length; ++i) {
            if (record && stringContainingText.regionMatches(true, i, closeTag, 0, closeTag.length())) {
                break;
            }
            if (record) {
                buf.append(stringContainingText.charAt(i));
            }
            if (!stringContainingText.regionMatches(true, i, openTag, 0, openTag.length())) {
                continue;
            }
            buf = new StringBuilder();
            // Together with the loop increment this lands on the first character after the tag.
            i += openTag.length() - 1;
            record = true;
        }
        return buf.toString().trim();
    }

    /**
     * Looks up a {@code NAME="value"} attribute in a space-separated attribute string.
     *
     * <p>Angle brackets are removed first. Attribute names are compared case-insensitively.
     * Values containing spaces are not supported because the string is split on spaces.
     *
     * @param stringContainingAttributes the attribute text, e.g. the first line of a story
     * @param attributeName the attribute to look up
     * @return the value without its surrounding quotes, or an empty string if the attribute
     *         is absent or has no {@code =value} part
     */
    protected String extractAttribute(String stringContainingAttributes, String attributeName) {
        String cleaned = stringContainingAttributes.replace("<", "").replace(">", "");
        for (String token : cleaned.split(" ")) {
            String keyValue = token.trim();
            int eq = keyValue.indexOf('=');
            if (eq < 0) {
                // A bare word without a value cannot be the requested attribute.
                continue;
            }
            if (!keyValue.substring(0, eq).equalsIgnoreCase(attributeName)) {
                continue;
            }
            String value = keyValue.substring(eq + 1);
            boolean quoted = value.length() >= 2
                    && (value.charAt(0) == '"' || value.charAt(0) == '\'')
                    && value.charAt(value.length() - 1) == value.charAt(0);
            // Strip the quotes only when they are really there, so unquoted or very short
            // values are returned intact instead of being clipped or causing an exception.
            return quoted ? value.substring(1, value.length() - 1) : value;
        }
        return "";
    }
}

