# Source code for systematic_review.search_count

"""Module: search_count
This module contains all necessary functions for searching the citations, articles text and count number of search_words_object
present.
"""

import pandas as pd

from typing import List, Union, Dict, Any

from systematic_review import string_manipulation, validation
from systematic_review import converter


def remove_duplicates_keywords_from_next_groups(preprocessed_clean_grouped_keywords_dict: dict) -> dict:
    """Keep each keyword only in the first group where it appears.

    Takes a {keyword_group_name: [clean_keywords], ...} dict and removes from
    every later group any keyword that already appeared in an earlier group
    (or earlier in the same group).

    Parameters
    ----------
    preprocessed_clean_grouped_keywords_dict : dict
        This is output dictionary which contains processed non-duplicate search_words_object dict.
        Example - {'keyword_group_1': ["management", "investing", "risk", "pre", "process"],
        'keyword_group_2': ["corporate", "pricing", "risk"],...}

    Returns
    -------
    dict
        This is the dictionary comprised of unique search_words_object in each keyword groups. It means keyword from
        first keyword group can not be found in any other keyword group.
        Example - {'keyword_group_1': ["management", "investing", "risk", "pre", "process"],
        'keyword_group_2': ["corporate", "pricing"],...}
        'risk' is removed from keyword_group_2.

    """
    # BUG FIX: the previous implementation removed keywords from a list while
    # iterating over that same list (dict.copy() is shallow, so the copied dict
    # shared the list objects). That skipped the element after every removal,
    # letting cross-group duplicates survive, and it also mutated the caller's
    # input dict. Building fresh lists avoids both problems.
    seen_keywords = set()
    deduplicated_groups = {}
    for keyword_group_name, grouped_keywords in preprocessed_clean_grouped_keywords_dict.items():
        unique_in_group = []
        for keyword in grouped_keywords:
            if keyword not in seen_keywords:
                seen_keywords.add(keyword)
                unique_in_group.append(keyword)
        deduplicated_groups[keyword_group_name] = unique_in_group
    return deduplicated_groups
class SearchWords:
    """Build, clean and group the search words used for counting in texts.

    ``search_words`` may be a JSON file path (str), a list of space-separated
    keyword strings, or a {group_name: keywords_string} dict. The processed
    result is stored in ``self.value`` as {group_name: [keyword, ...], ...}.
    """

    # Editable template offered to users via get_sample_search_words_json().
    sample_dict = {'keywords_finance': 'Management investing corporate pricing risk',
                   'keywords_machine_learning': 'neural fuzzy inference system artificial intelligence artificial '
                                                'computational neural networks',
                   'keywords_common_words': 'accuracy classification cross sectional cross-section expected metrics '
                                            'prediction predict expert system'}

    def __init__(self, search_words, text_manipulation_method_name: str = "preprocess_string",
                 custom_text_manipulation_function=None, default_search_words_group_name: str = "search_words_group_",
                 all_unique_keywords: bool = False, unique_keywords: bool = True, *args, **kwargs):
        """Preprocess ``search_words`` and store the result in ``self.value``.

        Parameters
        ----------
        search_words : Union[str, list, dict]
            json file path, list of keyword strings, or {group_name: keywords_string} dict.
        text_manipulation_method_name : str
            provides the options to use any text manipulation function.
            preprocess_string (default and applied before all other implemented functions)
        custom_text_manipulation_function : function
            This is optional custom_text_manipulation_function function if you want to implement this yourself.
            pass as custom_text_manipulation_function = function_name.
        default_search_words_group_name : str
            prefix used to auto-name groups when ``search_words`` is a list.
        all_unique_keywords : bool
            if True, a keyword may appear in only one group (duplicates are removed from later groups).
        unique_keywords : bool
            if True, duplicate keywords inside each group are removed.
        """
        self.args = args
        self.kwargs = kwargs
        self.all_unique_keywords = all_unique_keywords
        self.default_search_words_group_name = default_search_words_group_name
        self.custom_text_manipulation_function = custom_text_manipulation_function
        self.unique_keywords = unique_keywords
        self.text_manipulation_method_name = text_manipulation_method_name
        # isinstance instead of type() == for idiomatic, subclass-friendly checks.
        if isinstance(search_words, str):
            self.search_words_path = search_words
            self.value = self.preprocess_searched_keywords(converter.json_file_to_dict(self.search_words_path))
        elif isinstance(search_words, list):
            self.search_word_list = search_words
            self.value = self.preprocess_searched_keywords(self.construct_search_words_from_list())
        elif isinstance(search_words, dict):
            self.search_word_dict = search_words
            self.value = self.preprocess_searched_keywords(self.search_word_dict)
        else:
            # NOTE(review): self.value stays unset on this path; raising TypeError
            # would be cleaner, but the print is kept to preserve behavior.
            print(f"search_words type {type(search_words)} is incorrect, It must be str, list, or dict.")

    def get_sample_search_words_json(self, output_file_path: str = "sample_search_words_template.json") -> None:
        """Write the sample search-words JSON template to disk for users to edit.

        Parameters
        ----------
        output_file_path : str
            this is optional output file path for json template

        Returns
        -------
        None
            function create the file on the root folder unless specified in output_file_path

        """
        converter.write_json_file_with_dict(output_file_path, self.sample_dict)

    def unique_keywords_in_preprocessed_clean_keywords_dict(self) -> set:
        """Return set of unique search words from all keyword groups.

        Returns
        -------
        set
            This is set of unique search_words_object from all of search_words_object groups.

        """
        return {keyword for keywords_list in self.value.values() for keyword in keywords_list}

    def construct_search_words_from_list(self) -> dict:
        """Turn the keyword-string list into an auto-named group dict.

        ['keyword1 keyword2 keyword3', 'keyword1 keyword2'] becomes
        {'search_words_group_1': 'keyword1 keyword2 keyword3', 'search_words_group_2': 'keyword1 keyword2'}.

        Returns
        -------
        dict
            the dictionary contains the group name and search_words_object paired as value

        """
        return {self.default_search_words_group_name + str(suffix): keywords
                for suffix, keywords in enumerate(self.search_word_list, start=1)}

    def preprocess_search_keywords_dictionary(self, grouped_keywords_dictionary: dict) -> dict:
        """Clean each group's keyword string into a list of processed keywords.

        The configured text manipulation method removes symbols and lower-cases
        the text; duplicates inside a group are dropped when ``self.unique_keywords``.

        Parameters
        ----------
        grouped_keywords_dictionary : dict
            Example - {'keyword_group_name': "Management investing corporate pricing risk Risk Pre-process",...}

        Returns
        -------
        dict
            Example - {'keyword_group_name': ["management", "investing", "corporate", "pricing", "risk", "pre",
            "process"],...}

        """
        preprocessed_clean_grouped_keywords_dictionary = {}
        for keyword_group_name, keywords in grouped_keywords_dictionary.items():
            # NOTE(review): self.args / self.kwargs are passed positionally (not
            # unpacked) — assumed to match text_manipulation_methods' signature.
            preprocessed_string = string_manipulation.text_manipulation_methods(
                keywords, self.text_manipulation_method_name, self.custom_text_manipulation_function,
                self.args, self.kwargs)
            words = preprocessed_string.split()
            if self.unique_keywords:
                words = string_manipulation.split_words_remove_duplicates(words)
            preprocessed_clean_grouped_keywords_dictionary[keyword_group_name] = words
        return preprocessed_clean_grouped_keywords_dictionary

    def preprocess_searched_keywords(self, grouped_keywords_dictionary: dict) -> dict:
        """Preprocess keywords and optionally de-duplicate them across groups.

        Parameters
        ----------
        grouped_keywords_dictionary : dict
            Example - {'keyword_group_name': "Management investing corporate pricing risk Risk Pre-process",...}

        Returns
        -------
        dict
            Example - {'keyword_group_1': ["management", "investing", "risk", "pre", "process"],
            'keyword_group_2': ["corporate", "pricing"],...}
            'risk' is removed from keyword_group_2 when ``self.all_unique_keywords`` is True.

        """
        preprocessed_keywords = self.preprocess_search_keywords_dictionary(grouped_keywords_dictionary)
        if self.all_unique_keywords:
            return remove_duplicates_keywords_from_next_groups(preprocessed_keywords)
        return preprocessed_keywords

    def creating_default_keyword_count_dict(self):
        """Initialise keyword count dict with value 0 for every keyword.

        Returns
        -------
        dict
            {'total_keywords': 0, '<group>_count': 0, '<keyword>': 0, ...}

        """
        keyword_count_dict = {"total_keywords": 0}
        for group_name, keywords_list in self.value.items():
            keyword_count_dict[str(group_name) + "_count"] = 0
            for keyword in keywords_list:
                keyword_count_dict[keyword] = 0
        return keyword_count_dict

    def get_sorting_keywords_criterion_list(self) -> List[str]:
        """Build the sorting criterion column list in the desired logical order.

        Returns
        -------
        List[str]
            total_keywords first, then every '<group>_count', then every keyword.

        """
        sorting_keywords_criterion_list = ["total_keywords"]
        for keyword_group_name in self.value.keys():
            sorting_keywords_criterion_list.append(str(keyword_group_name) + "_count")
        for keywords_list in self.value.values():
            sorting_keywords_criterion_list.extend(keywords_list)
        return sorting_keywords_criterion_list

    def generate_keywords_count_dictionary(self, text):
        """Count occurrences of every search word in ``text``.

        Parameters
        ----------
        text : str
            Preprocessed text whose whitespace-separated words are matched
            against the keyword groups.

        Returns
        -------
        dict
            Counts dict with 'total_keywords', per-group '<group>_count' and
            per-keyword entries.

        """
        keyword_count_dict = self.creating_default_keyword_count_dict()
        total_keywords_counts = 0
        for searched_word in text.split():
            # a word present in several groups is counted once per matching group
            for keyword_group_name, unique_keywords in self.value.items():
                if searched_word in unique_keywords:
                    total_keywords_counts += 1
                    keyword_count_dict[str(keyword_group_name) + "_count"] += 1
                    keyword_count_dict[searched_word] += 1
        keyword_count_dict["total_keywords"] = total_keywords_counts
        return keyword_count_dict
class SearchCount:
    """Used to search search_words in citations and research papers.

    This can output both records list and pandas.DataFrame as well as can take both inputs.
    """

    # column that flags whether a research paper was downloaded ("yes" to count it)
    download_flag_column_name = 'downloaded'
    # column holding the research paper file path
    research_paper_file_location_column_name = 'file location'
    # column holding the citation full text
    citation_text_column_name = "citation_text"

    def __init__(self, data: Union[List[dict], pd.DataFrame], search_words_object: SearchWords,
                 text_manipulation_method_name: str = "preprocess_string", custom_text_manipulation_function=None,
                 *args, **kwargs):
        """Set up all necessary data for start counting.

        Parameters
        ----------
        data : Union[List[dict], pd.DataFrame]
            Citations or research-papers data; a DataFrame is converted to a records list.
        search_words_object : search_count.SearchWords
            search_words_object should contain dictionary comprised of unique search_words_object in each keyword
            groups. Example - {'keyword_group_1': ["management", "investing", "risk", "pre", "process"],
            'keyword_group_2': ["corporate", "pricing"],...}
        text_manipulation_method_name : str
            provides the options to use any text manipulation function.
            preprocess_string (default and applied before all other implemented functions)
        custom_text_manipulation_function : function
            This is optional custom_text_manipulation_function function if you want to implement this yourself.
            pass as custom_text_manipulation_function = function_name.
        kwargs : Dict[str, Any]
            These key = word or {key: word} arguments are for custom_text_manipulation_function
        args : Tuple[Any, Any]
            These arguments are for custom_text_manipulation_function
        """
        self.args = args
        self.kwargs = kwargs
        self.custom_text_manipulation_function = custom_text_manipulation_function
        # isinstance instead of type() == for idiomatic, subclass-friendly checks.
        self.data = converter.dataframe_to_records_list(data) if isinstance(data, pd.DataFrame) else data
        self.text_manipulation_method_name = text_manipulation_method_name
        self.search_words_object = search_words_object

    def counts(self) -> List[Dict[str, Any]]:
        """Dispatch to the citation or research-paper counter based on the data columns.

        Returns
        -------
        List[Dict[str, Any]]
            records list containing the citation data or research papers data with counts.

        """
        if (self.download_flag_column_name in self.data[0]) and (
                self.research_paper_file_location_column_name in self.data[0]):
            return self.count_search_words_in_research_paper_text(self.data)
        else:
            return self.count_search_words_in_citations_text(self.data)

    def count_search_words_in_citations_text(self, citations_records_list: List[Dict[str, Any]]
                                             ) -> List[Dict[str, Any]]:
        """Loop over each citations to count search words (SearchWords instance) in citation data.

        Parameters
        ----------
        citations_records_list : List[Dict[str, Any]]
            This list contains all the citations details with column named 'citation_text' containing full text
            like article name, abstract and keyword.

        Returns
        -------
        List[Dict[str, Any]]
            This is the list of all citations search result which contains our all search_words_object count.
            Examples - [{'title': 'name', 'total_keywords': count, 'keyword_group_1_count': count,
            "management": count, ...},...]

        """
        final_list_of_full_search_words_counts_citations_dict = []
        for citation_dict in citations_records_list:
            # normalise the citation text with the configured manipulation method
            text = string_manipulation.text_manipulation_methods(citation_dict[self.citation_text_column_name],
                                                                 self.text_manipulation_method_name,
                                                                 self.custom_text_manipulation_function,
                                                                 self.args, self.kwargs)
            search_words_counts_dict = self.search_words_object.generate_keywords_count_dictionary(text)
            # merge citation fields with their search-word counts
            full_search_words_counts_dict = {**citation_dict, **search_words_counts_dict}
            final_list_of_full_search_words_counts_citations_dict.append(full_search_words_counts_dict)
        return final_list_of_full_search_words_counts_citations_dict

    def count_search_words_in_research_paper_text(self, research_papers_records_list: List[Dict[str, Any]]
                                                  ) -> List[Dict[str, Any]]:
        """Loop over validated research paper to count search words (SearchWords instance) in research papers data.

        Parameters
        ----------
        research_papers_records_list : List[Dict[str, Any]]
            This list contains data of all the research papers files contained in directory_path.

        Returns
        -------
        List[Dict[str, Any]]
            This is the list of all research papers search result which contains our all search_words_object count.
            Examples - [{'article': 'article_name', 'total_keywords': count, 'keyword_group_1_count': count,
            "management": count, ...},...]

        """
        final_list_of_full_search_words_counts_citations_dict = []
        for research_papers_record in research_papers_records_list:
            # only papers flagged as downloaded can be read from disk
            if research_papers_record[self.download_flag_column_name] != "yes":
                continue
            research_paper_text = converter.Reader(
                research_papers_record[self.research_paper_file_location_column_name]).get_text()
            text = string_manipulation.text_manipulation_methods(research_paper_text,
                                                                 self.text_manipulation_method_name,
                                                                 self.custom_text_manipulation_function,
                                                                 self.args, self.kwargs)
            search_words_counts_dict = self.search_words_object.generate_keywords_count_dictionary(text)
            # merge paper fields with their search-word counts
            full_search_words_counts_dict = {**research_papers_record, **search_words_counts_dict}
            final_list_of_full_search_words_counts_citations_dict.append(full_search_words_counts_dict)
        return final_list_of_full_search_words_counts_citations_dict

    def get_records_list(self) -> List[Dict[str, Any]]:
        """Outputs the records list containing counts results of input data.

        Returns
        -------
        List[Dict[str, Any]]
            This is the list of records which contains all search_words_object count from input data.

        """
        return self.counts()

    def get_dataframe(self):
        """Outputs the pandas.DataFrame containing counts results of input data.

        Returns
        -------
        pandas.DataFrame
            This is the dataframe of all search results which contains our all search_words_object count.

        """
        return converter.records_list_to_dataframe(self.counts())

    def to_csv(self, output_filename: Union[str, None] = "output.csv", index: bool = True):
        """This function saves pandas.DataFrame to csv file.

        Parameters
        ----------
        output_filename : str
            This is the name of output file which should contains .csv extension
        index : bool
            Define if index is needed in output csv file or not.

        Returns
        -------

        """
        converter.dataframe_to_csv_file(self.get_dataframe(), output_filename, index)

    def to_excel(self, output_filename: Union[str, None] = "output.xlsx", index: bool = True):
        """This function saves pandas.DataFrame to excel file.

        BUG FIX: the default filename was "output.csv", contradicting the
        documented .xlsx extension for excel output.

        Parameters
        ----------
        output_filename : str
            This is the name of output file which should contains .xlsx extension
        index : bool
            Define if index is needed in output excel file or not.

        Returns
        -------

        """
        converter.dataframe_to_excel_file(self.get_dataframe(), output_filename, index)
def adding_dict_key_or_increasing_value(input_dict: dict, dict_key: str, step: int = 1, default_dict_value: int = 1):
    """Increase the value of dict(key:value) by step using key.

    If key not present then it get initialised with default dict value.

    Parameters
    ----------
    input_dict : dict
        This is the dictionary which we want to modify.
    dict_key : str
        This is the key of dictionary
    step : int
        This is the addition number by which value of dictionary needed to be increased.
    default_dict_value : int
        If key is not available in dictionary then this default value is used to add new key.

    Returns
    -------
    dict
        This is the modified dictionary

    """
    # EAFP: attempt the increment, initialise on a missing key.
    try:
        input_dict[dict_key] += step
    except KeyError:
        input_dict[dict_key] = default_dict_value
    return input_dict
def count_words_in_list_of_lists(list_of_lists: List[list]) -> dict:
    """count words in list containing other lists with words.

    Parameters
    ----------
    list_of_lists : List[list]
        This list contains each element of type list.

    Returns
    -------
    dict
        dictionary with key as words and value as counts

    """
    words_frequency = {}
    for inner_list in list_of_lists:
        for raw_keyword in inner_list:
            # normalise before counting so variants collapse to one key
            normalised_keyword = string_manipulation.preprocess_string(raw_keyword)
            words_frequency = adding_dict_key_or_increasing_value(words_frequency, normalised_keyword)
    return words_frequency
def count_keywords_in_citations_full_text_list(citations_with_fulltext_list: list,
                                               unique_preprocessed_clean_grouped_keywords_dict: dict,
                                               title_column_name: str = "title",
                                               method: str = "preprocess_string",
                                               custom=None) -> list:
    """Loop over articles to calculate search_words_object counts

    Parameters
    ----------
    custom : function
        This is optional custom_text_manipulation_function function if you want to implement this yourself.
        pass as custom_text_manipulation_function = function_name. it will take text as parameter with no default
        preprocess_string operation.
    method : str
        provides the options to use any text manipulation function.
        preprocess_string (default and applied before all other implemented functions)
    citations_with_fulltext_list : list
        This list contains all the citations details with column named 'citation_text' containing full text like
        article name, abstract and keyword.
    unique_preprocessed_clean_grouped_keywords_dict : dict
        looks like this {'keyword_group_1': ["management", "investing", "risk", "pre", "process"],
        'keyword_group_2': ["corporate", "pricing"],...}
    title_column_name : str
        This is the name of column which contain citation title

    Returns
    -------
    list
        This is the list of all citations search result which contains our all search_words_object count.
        Examples - [{'primary_title': 'name', 'total_keywords': count, 'keyword_group_1_count': count,
        "management": count, ...},...]

    """
    final_list_of_full_keywords_counts_citations_dict = []
    keyword_count_dict = creating_keyword_count_dict(unique_preprocessed_clean_grouped_keywords_dict)
    # iterating through each citation details one by one.
    for citation_dict in citations_with_fulltext_list:
        print(f"article: {citation_dict[title_column_name]}")
        # BUG FIX: copy the record instead of aliasing it; the original mutated
        # the caller's citation dicts in place via update().
        full_keywords_counts_dict = dict(citation_dict)
        full_keywords_counts_dict.update(keyword_count_dict)
        total_keywords_counts = 0
        citation_full_text = string_manipulation.text_manipulation_methods(citation_dict['citation_text'],
                                                                           method, custom).split()
        # taking words one by one from full_text of citation.
        for searched_word in citation_full_text:
            # checking the word in grouped search words and add to full_keywords_counts_dict.
            for keyword_group_name, unique_keywords in unique_preprocessed_clean_grouped_keywords_dict.items():
                if searched_word in unique_keywords:
                    total_keywords_counts += 1
                    full_keywords_counts_dict = adding_dict_key_or_increasing_value(
                        full_keywords_counts_dict, str(keyword_group_name) + "_count")
                    full_keywords_counts_dict = adding_dict_key_or_increasing_value(
                        full_keywords_counts_dict, searched_word)
        full_keywords_counts_dict.update({"total_keywords": total_keywords_counts})
        final_list_of_full_keywords_counts_citations_dict.append(full_keywords_counts_dict)
    return final_list_of_full_keywords_counts_citations_dict
def creating_keyword_count_dict(unique_preprocessed_clean_grouped_keywords_dict: dict):
    """Initialise keyword count dict with value 0 for every keyword.

    Parameters
    ----------
    unique_preprocessed_clean_grouped_keywords_dict : dict
        looks like this {'keyword_group_1': ["management", "investing", "risk", "pre", "process"],
        'keyword_group_2': ["corporate", "pricing"],...}

    Returns
    -------
    dict
        This contains key as keyword and value as 0.

    """
    zeroed_counts = {}
    for group_name, keywords_list in unique_preprocessed_clean_grouped_keywords_dict.items():
        # one '<group>_count' entry per group, followed by its keywords
        zeroed_counts[str(group_name) + "_count"] = 0
        for keyword in keywords_list:
            zeroed_counts[keyword] = 0
    return zeroed_counts
def count_search_words_in_citations_text(citations_with_fulltext_list: list, search_words_object: SearchWords,
                                         text_column_name: str = "citation_text",
                                         text_manipulation_method_name: str = "preprocess_string",
                                         custom=None, custom_text_manipulation_function=None,
                                         *args, **kwargs) -> list:
    """Loop over articles to calculate search_words_object counts

    BUG FIX: the default for ``text_column_name`` was "'citation_text'" —
    including literal quote characters — which could never match the real
    'citation_text' column and raised KeyError on every record.

    Parameters
    ----------
    custom_text_manipulation_function : function
        This is optional custom_text_manipulation_function function if you want to implement this yourself.
        pass as custom_text_manipulation_function = function_name. it will take text as parameter with no default
        preprocess_string operation.
    custom : function
        Unused; retained only for backward compatibility of the signature.
    text_manipulation_method_name : str
        provides the options to use any text manipulation function.
        preprocess_string (default and applied before all other implemented functions)
    citations_with_fulltext_list : list
        This list contains all the citations details with column named 'citation_text' containing full text like
        article name, abstract and keyword.
    search_words_object : SearchWords
        its .value looks like this {'keyword_group_1': ["management", "investing", "risk", "pre", "process"],
        'keyword_group_2': ["corporate", "pricing"],...}
    text_column_name : str
        This is the name of column which contain citation text

    Returns
    -------
    list
        This is the list of all citations search result which contains our all search_words_object count.
        Examples - [{'primary_title': 'name', 'total_keywords': count, 'keyword_group_1_count': count,
        "management": count, ...},...]

    """
    final_list_of_full_search_words_counts_citations_dict = []
    # iterating through each citation details one by one.
    for citation_dict in citations_with_fulltext_list:
        # normalise the citation text with the selected manipulation method
        text = string_manipulation.text_manipulation_methods(citation_dict[text_column_name],
                                                             text_manipulation_method_name,
                                                             custom_text_manipulation_function, args, kwargs)
        search_words_counts_dict = search_words_object.generate_keywords_count_dictionary(text)
        # merge citation fields with their search-word counts
        full_search_words_counts_dict = {**citation_dict, **search_words_counts_dict}
        final_list_of_full_search_words_counts_citations_dict.append(full_search_words_counts_dict)
    return final_list_of_full_search_words_counts_citations_dict
def citation_list_of_dict_search_count_to_df(citations_list: list, keywords: dict, title_column_name: str = "title",
                                             method: str = "preprocess_string", custom=None) -> pd.DataFrame:
    """Loop over articles to calculate search_words_object counts and return dataframe.

    Parameters
    ----------
    title_column_name : str
        This is the name of column which contain citation title
    custom : function
        This is optional custom_text_manipulation_function function if you want to implement this yourself.
        pass as custom_text_manipulation_function = function_name. it will take text as parameter with no default
        preprocess_string operation.
    method : str
        provides the options to use any text manipulation function.
        preprocess_string (default and applied before all other implemented functions)
    citations_list : list
        list with additional columns needed for next steps of systematic review and duplicates are removed
    keywords : dict
        This is output dictionary which contains processed non-duplicate search_words_object dict.
        Example - {'keyword_group_name': ["management", "investing", "corporate", "pricing", "risk", "pre",
        "process"],...}

    Returns
    -------
    pandas.DataFrame object
        This is pandas object of all citations search result which contains our all search_words_object count.

    """
    # count per citation, then convert the records list into a dataframe
    counted_records = count_keywords_in_citations_full_text_list(citations_list, keywords,
                                                                 title_column_name, method, custom)
    return converter.records_list_to_dataframe(counted_records)
def count_keywords_in_citations_full_text(dataframe_citations_with_fulltext: pd.DataFrame,
                                          unique_preprocessed_clean_grouped_keywords_dict: dict,
                                          title_column_name: str = "title",
                                          method: str = "preprocess_string", custom=None) -> list:
    """Loop over articles to calculate keywords counts

    Parameters
    ----------
    custom : function
        This is optional custom function if you want to implement this yourself. pass as custom = function_name.
        it will take text as parameter with no default preprocess_string operation.
    method : str
        provides the options to use any text manipulation function.
        preprocess_string (default and applied before all other implemented functions)
    dataframe_citations_with_fulltext : pd.DataFrame
        This dataframe contains all the citations details with column named 'full_text' containing full text like
        article name, abstract and keyword.
    unique_preprocessed_clean_grouped_keywords_dict : dict
        looks like this {'keyword_group_1': ["management", "investing", "risk", "pre", "process"],
        'keyword_group_2': ["corporate", "pricing"],...}
    title_column_name : str
        This is the name of column which contain citation title

    Returns
    -------
    list
        This is the list of all citations search result which contains our all keywords count.
        Examples - [{'primary_title': 'name', 'total_keywords': count, 'keyword_group_1_count': count,
        "management": count, ...},...]

    """
    results = []
    zeroed_counts = creating_keyword_count_dict(unique_preprocessed_clean_grouped_keywords_dict)
    # one counts record per citation row
    for _, citation_row in dataframe_citations_with_fulltext.iterrows():
        print(f"article: {citation_row[title_column_name]}")
        counts_record = {title_column_name: str(citation_row[title_column_name])}
        counts_record.update(zeroed_counts)
        matched_total = 0
        words = string_manipulation.text_manipulation_methods(citation_row['full_text'], method, custom).split()
        for word in words:
            # a word matching several groups is counted once per group
            for group_name, group_keywords in unique_preprocessed_clean_grouped_keywords_dict.items():
                if word in group_keywords:
                    matched_total += 1
                    counts_record = adding_dict_key_or_increasing_value(counts_record,
                                                                        str(group_name) + "_count")
                    counts_record = adding_dict_key_or_increasing_value(counts_record, word)
        counts_record.update({"total_keywords": matched_total})
        results.append(counts_record)
    return results
def citation_search_count_dataframe(citations_df: pd.DataFrame, keywords: dict, title_column_name: str = "title",
                                    method: str = "preprocess_string", custom=None) -> pd.DataFrame:
    """Loop over articles to calculate keywords counts and return dataframe.

    Parameters
    ----------
    title_column_name : str
        This is the name of column which contain citation title
    custom : function
        This is optional custom function if you want to implement this yourself. pass as custom = function_name.
        it will take text as parameter with no default preprocess_string operation.
    method : str
        provides the options to use any text manipulation function.
        preprocess_string (default and applied before all other implemented functions)
    citations_df : pandas.DataFrame object
        DataFrame with additional columns needed for next steps of systematic review and duplicates are removed
    keywords : dict
        This is output dictionary which contains processed non-duplicate keywords dict.
        Example - {'keyword_group_name': ["management", "investing", "corporate", "pricing", "risk", "pre",
        "process"],...}

    Returns
    -------
    pandas.DataFrame object
        This is pandas object of all citations search result which contains our all keywords count.

    """
    # count per row, then convert the records list back into a dataframe
    counted_records = count_keywords_in_citations_full_text(citations_df, keywords,
                                                            title_column_name, method, custom)
    return converter.records_list_to_dataframe(counted_records)
def count_keywords_in_pdf_full_text(list_of_downloaded_articles_path: list,
                                    unique_preprocessed_clean_grouped_keywords_dict: dict,
                                    title_column_name: str = "cleaned_title_pdf",
                                    method: str = "preprocess_string", custom=None) -> list:
    """Loop over articles pdf files to calculate keywords counts.

    Parameters
    ----------
    list_of_downloaded_articles_path : list
        This list contains path of all the pdf files contained in directory_path.
    unique_preprocessed_clean_grouped_keywords_dict : dict
        looks like this {'keyword_group_1': ["management", "investing", "risk", "pre", "process"],
        'keyword_group_2': ["corporate", "pricing"],...}
    title_column_name : str
        This is the name of column which contain citation title.
    method : str
        provides the options to use any text manipulation function.
        preprocess_string (default and applied before all other implemented functions)
        custom - for putting your custom function to preprocess the text
        nltk_remove_stopwords, pattern_lemma_or_lemmatize_text, nltk_word_net_lemmatizer,
        nltk_porter_stemmer, nltk_lancaster_stemmer, spacy_lemma,
        nltk_remove_stopwords_spacy_lemma, convert_string_to_lowercase
    custom : function
        This is optional custom function if you want to implement this yourself. pass as
        custom = function_name. it will take text as parameter with no default
        preprocess_string operation.

    Returns
    -------
    list
        This is the list of all citations search result which contains our all keywords count.
        Examples - [{'article': 'article_name', 'total_keywords': count,
        'keyword_group_1_count': count, "management": count, "investing: count", "risk: count",
        'keyword_group_2_count': count, "corporate": count, "pricing": count,...}]

    """
    final_list_of_full_keywords_counts_pdf_text_dict = []
    keyword_count_dict = creating_keyword_count_dict(unique_preprocessed_clean_grouped_keywords_dict)
    # Performance: membership is tested once per word of every PDF's full text, so
    # convert each keyword group to a set up front (O(1) lookups instead of scanning
    # the group list for every word). Behavior is unchanged.
    grouped_keyword_sets = {group_name: set(keywords_list)
                            for group_name, keywords_list
                            in unique_preprocessed_clean_grouped_keywords_dict.items()}
    # iterating through each pdf path one by one.
    for pdf_path in list_of_downloaded_articles_path:
        article_name = string_manipulation.preprocess_string_to_space_separated_words(
            string_manipulation.pdf_filename_from_filepath(pdf_path))
        print("article: ", article_name)
        # Seed the per-article record with the title and zeroed keyword counters.
        full_keywords_counts_dict = {title_column_name: str(article_name)}
        full_keywords_counts_dict.update(keyword_count_dict)
        total_keywords_counts = 0
        try:
            pdf_text = converter.get_text_from_multiple_pdf_reader(pdf_path)
        except FileNotFoundError:
            # Deliberate best-effort: a missing/unreadable path skips this article
            # instead of aborting the whole batch.
            continue
        pdf_full_text = string_manipulation.text_manipulation_methods(pdf_text, method, custom).split()
        # taking words one by one from full_text of pdf file.
        for searched_word in pdf_full_text:
            # checking the word in grouped keywords and add to full_keywords_count_dict.
            for keyword_group_name, unique_keywords in grouped_keyword_sets.items():
                if searched_word in unique_keywords:
                    total_keywords_counts += 1
                    group_name_count = str(keyword_group_name) + "_count"
                    full_keywords_counts_dict = adding_dict_key_or_increasing_value(
                        full_keywords_counts_dict, group_name_count)
                    full_keywords_counts_dict = adding_dict_key_or_increasing_value(
                        full_keywords_counts_dict, searched_word)
        full_keywords_counts_dict.update({"total_keywords": total_keywords_counts})
        final_list_of_full_keywords_counts_pdf_text_dict.append(full_keywords_counts_dict)
    return final_list_of_full_keywords_counts_pdf_text_dict
def pdf_full_text_search_count_dataframe(list_of_downloaded_articles_path: list,
                                         unique_preprocessed_clean_grouped_keywords_dict: dict,
                                         title_column_name: str = "cleaned_title",
                                         method: str = "preprocess_string", custom=None
                                         ) -> pd.DataFrame:
    """Count keywords across all downloaded article PDFs and return the results as a DataFrame.

    Thin wrapper: runs ``count_keywords_in_pdf_full_text`` over the given paths and converts
    the resulting per-article records into a pandas DataFrame.

    Parameters
    ----------
    list_of_downloaded_articles_path : list
        This list contains path of all the pdf files contained in directory_path.
    unique_preprocessed_clean_grouped_keywords_dict : dict
        looks like this {'keyword_group_1': ["management", "investing", "risk", "pre", "process"],
        'keyword_group_2': ["corporate", "pricing"],...}
    title_column_name : str
        This is the name of column which contain citation title.
    method : str
        provides the options to use any text manipulation function.
        preprocess_string (default and applied before all other implemented functions)
        custom - for putting your custom function to preprocess the text
        nltk_remove_stopwords, pattern_lemma_or_lemmatize_text, nltk_word_net_lemmatizer,
        nltk_porter_stemmer, nltk_lancaster_stemmer, spacy_lemma,
        nltk_remove_stopwords_spacy_lemma, convert_string_to_lowercase
    custom : function
        This is optional custom function if you want to implement this yourself. pass as
        custom = function_name. it will take text as parameter with no default
        preprocess_string operation.

    Returns
    -------
    pandas.DataFrame object
        This is the dataframe of all citations search result which contains our all keywords count.
        Examples - [{'article': 'article_name', 'total_keywords': count,
        'keyword_group_1_count': count, "management": count, "investing: count", "risk: count",
        'keyword_group_2_count': count, "corporate": count, "pricing": count,...}]

    """
    per_article_counts = count_keywords_in_pdf_full_text(
        list_of_downloaded_articles_path, unique_preprocessed_clean_grouped_keywords_dict,
        title_column_name, method, custom)
    return converter.records_list_to_dataframe(per_article_counts)
def get_sorting_keywords_criterion_list(unique_preprocessed_clean_grouped_keywords_dict: dict) -> list:
    """Build the column-ordering criterion list derived from the grouped keywords.

    The resulting order is: 'total_keywords' first, then one '<group>_count' column per
    keyword group (in dict order), then every individual keyword (group by group).

    Parameters
    ----------
    unique_preprocessed_clean_grouped_keywords_dict : dict
        This is the dictionary comprised of unique keywords in each keyword groups. It means
        keyword from first keyword group can not be found in any other keyword group.
        Example - {'keyword_group_1': ["management", "investing", "risk", "pre", "process"],
        'keyword_group_2': ["corporate", "pricing"],...}. 'risk' is removed from keyword_group_2.

    Returns
    -------
    list
        This is the sorting criterion list which contains column in logical manner we desire.
        It contains total_keywords, group_keywords_counts, and keywords in the last.

    """
    criterion_columns = ["total_keywords"]
    # One aggregate count column per keyword group, preserving dict order.
    criterion_columns += [str(group_name) + "_count"
                          for group_name in unique_preprocessed_clean_grouped_keywords_dict]
    # Then each keyword column, group by group.
    for group_keywords in unique_preprocessed_clean_grouped_keywords_dict.values():
        criterion_columns.extend(group_keywords)
    return criterion_columns
def adding_citation_details_with_keywords_count_in_pdf_full_text(filter_sorted_citations_df: pd.DataFrame,
                                                                 pdf_full_text_search_count: list,
                                                                 unique_preprocessed_clean_grouped_keywords_dict: dict,
                                                                 first_column_name: str = "cleaned_title",
                                                                 second_column_name: str = 'cleaned_title_pdf') -> pd.DataFrame:
    """Attach citation details to the PDF full-text keyword counts by matching article titles.

    Drops the keyword-criterion columns from the sorted citations frame, then validates each
    remaining citation record against the PDF search-count records; only matched records are
    kept in the returned DataFrame.

    Parameters
    ----------
    filter_sorted_citations_df : pandas.DataFrame object
        This is the sorted dataframe which contains columns in this sequential manner. It contains
        citation df, total_keywords, group_keywords_counts, and keywords_counts in the last.
    pdf_full_text_search_count : list
        This is the list of all citations search result which contains our all search_words_object
        count. Examples - [{'article': 'article_name', 'total_keywords': count,
        'keyword_group_1_count': count, "management": count, "investing: count", "risk: count",
        'keyword_group_2_count': count, "corporate": count, "pricing": count,...}]
    unique_preprocessed_clean_grouped_keywords_dict : dict
        This is the dictionary comprised of unique search_words_object in each keyword groups.
        Example - {'keyword_group_1': ["management", "investing", "risk", "pre", "process"],
        'keyword_group_2': ["corporate", "pricing"],...}. 'risk' is removed from keyword_group_2.
    first_column_name : str
        This is the name of column which contain citation title.
    second_column_name : str
        This is the name of column which contain pdf article title.

    Returns
    -------
    pandas.DataFrame object
        This dataframe contains citations details from filtered and sorted citation full text
        dataframe and search_words_object counts from searching in pdf file text.

    """
    # Columns added by the keyword-count step; removing them leaves only citation details.
    criterion_columns = get_sorting_keywords_criterion_list(unique_preprocessed_clean_grouped_keywords_dict)
    citation_details_df = filter_sorted_citations_df.drop(columns=criterion_columns)
    citation_records = converter.dataframe_to_records_list(citation_details_df)
    # Keep only citation records whose title matches a PDF search-count record;
    # unmatched records are discarded.
    matched_records, _unmatched_records = validation.deep_validate_column_details_between_two_record_list(
        citation_records, pdf_full_text_search_count, first_column_name, second_column_name)
    return converter.records_list_to_dataframe(matched_records)