// Email analyzer (C++ implementation)



#include <algorithm>
#include <cctype>
#include <cstddef>
#include <cstdio>
#include <fstream>
#include <iostream>
#include <iterator>
#include <map>
#include <set>
#include <sstream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

//#include "stdafx.h"
//#include "SmartlyParser.h"


typedef std::vector<std::set<std::string>> setWords;

class SmartAnalyzer
{
private:
    double sentence_intersection(std::set<std::string> const& a, std::set<std::string> const& b);
    std::vector<std::string> parseSentences(std::string const& text);
    setWords stringSets(std::string const& text);
    std::string format(std::string text, bool const& includeDot = false);

public:
    SmartAnalyzer() {}
    std::string getSummary(std::string title, std::string const& text, int const& limit);
};



double SmartAnalyzer::sentence_intersection(std::set<std::string> const& a, std::set<std::string> const& b)
{
    std::vector<std::string> common;
    std::set_intersection(a.begin(), a.end(), b.begin(), b.end(), std::back_inserter(common));
    return (double) common.size() / ((a.size() + b.size()) / 2);
}

std::vector<std::string> SmartAnalyzer::parseSentences(std::string const& text)
{
    std::vector<std::string> output;
    std::istringstream iss(text);
    std::string token;

    while (std::getline(iss, token, '.')) {
        output.push_back(token);
    }

    return output;
}

setWords SmartAnalyzer::stringSets(std::string const& text)
{
    setWords output;
    std::istringstream iss(text), current;
    std::string token;

    while (std::getline(iss, token, '.')) {
        current.clear();
        current.str(token);
        output.push_back(std::set<std::string>((std::istream_iterator<std::string>(current)), std::istream_iterator<std::string>()));
    }

    return output;
}

std::string SmartAnalyzer::format(std::string text, bool const& includeDot)
{
    text.erase(std::remove_if(text.begin(), text.end(), [includeDot](char c) { return c == ',' || c == '!' || c == '"' || (includeDot && c == '.' ); }), text.end());
    std::transform(text.begin(), text.end(), text.begin(), ::tolower);
    return text;
}

std::string SmartAnalyzer::getSummary(std::string title, std::string const& text, int const& limit)
{
    std::vector<std::string> sentences = parseSentences(text);
    int sentLen = sentences.size();

    setWords sentencesC = stringSets(format(text));
    std::set<std::string> titles = std::set<std::string>((std::istream_iterator<std::string>(std::istringstream(format(title, true)))), std::istream_iterator<std::string>());

    double sum;
    std::map<double, int> sentencesWeight;

    for (int i = 0; i < sentLen; i++)
    {
        sum = 0;
        for (int j = 0; j < sentLen; j++)
        {
            if (i == j) { /* find intersection with the title, instead of self */
                sum += sentence_intersection(sentencesC[i], titles) * 2;
                continue;
            }
            sum += sentence_intersection(sentencesC[i], sentencesC[j]);
        }
        sentencesWeight.insert({ sum, i });
    }

    std::string output;

    for (std::map<double, int>::reverse_iterator it = sentencesWeight.rbegin(); it != sentencesWeight.rend(); ++it)
    {
        if (std::string(sentences[it->second] + output).size() > limit) {
            break;
        }
        output += sentences[it->second] + ".";
    }

    if (output.empty())
    {
        output = sentences[sentencesWeight.rbegin()->second];
        std::size_t pos = output.size();

        while ((pos = output.rfind(',', pos)) != std::string::npos && output.size() > limit)
        {
            output = output.substr(0, pos);
            pos--;
        }

        output += '.';
    }

    return output;
}

int _tmain()
{
    std::ifstream ifs("F:\\Analyzer\\text.txt");
    std::string content((std::istreambuf_iterator<char>(ifs)), (std::istreambuf_iterator<char>()));

    std::string title = "Stack, overflow";

    SmartAnalyzer a;
    std::cout << a.getSummary(title, content, 5 * 36);

    getchar();
    return 0;
}

Comments

Popular Posts