# Source code for systematic_review.search_count

"""Module: search_count
This module contains all necessary functions for searching the citations, articles text and count number of search_words_object
present.
"""

import pandas as pd

from typing import List, Union, Dict, Any

from systematic_review import string_manipulation, validation
from systematic_review import converter


def remove_duplicates_keywords_from_next_groups(preprocessed_clean_grouped_keywords_dict: dict) -> dict:
    """Keep each keyword only in the first group where it appears.

    Takes a {keyword_group_name: [clean_keywords], ...} dict and removes from
    every later group any keyword that already appeared in an earlier group
    (or earlier in the same group).

    Parameters
    ----------
    preprocessed_clean_grouped_keywords_dict : dict
        This is output dictionary which contains processed non-duplicate search_words_object dict.
        Example - {'keyword_group_1': ["management", "investing", "risk", "pre", "process"],
        'keyword_group_2': ["corporate", "pricing", "risk"],...}

    Returns
    -------
    dict
        This is the dictionary comprised of unique search_words_object in each keyword groups. It means keyword from
        first keyword group can not be found in any other keyword group.
        Example - {'keyword_group_1': ["management", "investing", "risk", "pre", "process"],
        'keyword_group_2': ["corporate", "pricing"],...}
        'risk' is removed from keyword_group_2.

    """
    # BUG FIX: the previous implementation removed keywords from a list while
    # iterating over that same list (dict.copy() is shallow, so the copied dict
    # shared the list objects). That skipped the element after every removal,
    # letting cross-group duplicates survive, and it also mutated the caller's
    # input dict. Building fresh lists avoids both problems.
    seen_keywords = set()
    deduplicated_groups = {}
    for keyword_group_name, grouped_keywords in preprocessed_clean_grouped_keywords_dict.items():
        unique_in_group = []
        for keyword in grouped_keywords:
            if keyword not in seen_keywords:
                seen_keywords.add(keyword)
                unique_in_group.append(keyword)
        deduplicated_groups[keyword_group_name] = unique_in_group
    return deduplicated_groups
class SearchWords:
    """Build, clean and group the search words used for counting in texts.

    ``search_words`` may be a JSON file path (str), a list of space-separated
    keyword strings, or a {group_name: keywords_string} dict. The processed
    result is stored in ``self.value`` as {group_name: [keyword, ...], ...}.
    """

    # Editable template offered to users via get_sample_search_words_json().
    sample_dict = {'keywords_finance': 'Management investing corporate pricing risk',
                   'keywords_machine_learning': 'neural fuzzy inference system artificial intelligence artificial '
                                                'computational neural networks',
                   'keywords_common_words': 'accuracy classification cross sectional cross-section expected metrics '
                                            'prediction predict expert system'}

    def __init__(self, search_words, text_manipulation_method_name: str = "preprocess_string",
                 custom_text_manipulation_function=None, default_search_words_group_name: str = "search_words_group_",
                 all_unique_keywords: bool = False, unique_keywords: bool = True, *args, **kwargs):
        """Preprocess ``search_words`` and store the result in ``self.value``.

        Parameters
        ----------
        search_words : Union[str, list, dict]
            json file path, list of keyword strings, or {group_name: keywords_string} dict.
        text_manipulation_method_name : str
            provides the options to use any text manipulation function.
            preprocess_string (default and applied before all other implemented functions)
        custom_text_manipulation_function : function
            This is optional custom_text_manipulation_function function if you want to implement this yourself.
            pass as custom_text_manipulation_function = function_name.
        default_search_words_group_name : str
            prefix used to auto-name groups when ``search_words`` is a list.
        all_unique_keywords : bool
            if True, a keyword may appear in only one group (duplicates are removed from later groups).
        unique_keywords : bool
            if True, duplicate keywords inside each group are removed.
        """
        self.args = args
        self.kwargs = kwargs
        self.all_unique_keywords = all_unique_keywords
        self.default_search_words_group_name = default_search_words_group_name
        self.custom_text_manipulation_function = custom_text_manipulation_function
        self.unique_keywords = unique_keywords
        self.text_manipulation_method_name = text_manipulation_method_name
        # isinstance instead of type() == for idiomatic, subclass-friendly checks.
        if isinstance(search_words, str):
            self.search_words_path = search_words
            self.value = self.preprocess_searched_keywords(converter.json_file_to_dict(self.search_words_path))
        elif isinstance(search_words, list):
            self.search_word_list = search_words
            self.value = self.preprocess_searched_keywords(self.construct_search_words_from_list())
        elif isinstance(search_words, dict):
            self.search_word_dict = search_words
            self.value = self.preprocess_searched_keywords(self.search_word_dict)
        else:
            # NOTE(review): self.value stays unset on this path; raising TypeError
            # would be cleaner, but the print is kept to preserve behavior.
            print(f"search_words type {type(search_words)} is incorrect, It must be str, list, or dict.")

    def get_sample_search_words_json(self, output_file_path: str = "sample_search_words_template.json") -> None:
        """Write the sample search-words JSON template to disk for users to edit.

        Parameters
        ----------
        output_file_path : str
            this is optional output file path for json template

        Returns
        -------
        None
            function create the file on the root folder unless specified in output_file_path

        """
        converter.write_json_file_with_dict(output_file_path, self.sample_dict)

    def unique_keywords_in_preprocessed_clean_keywords_dict(self) -> set:
        """Return set of unique search words from all keyword groups.

        Returns
        -------
        set
            This is set of unique search_words_object from all of search_words_object groups.

        """
        return {keyword for keywords_list in self.value.values() for keyword in keywords_list}

    def construct_search_words_from_list(self) -> dict:
        """Turn the keyword-string list into an auto-named group dict.

        ['keyword1 keyword2 keyword3', 'keyword1 keyword2'] becomes
        {'search_words_group_1': 'keyword1 keyword2 keyword3', 'search_words_group_2': 'keyword1 keyword2'}.

        Returns
        -------
        dict
            the dictionary contains the group name and search_words_object paired as value

        """
        return {self.default_search_words_group_name + str(suffix): keywords
                for suffix, keywords in enumerate(self.search_word_list, start=1)}

    def preprocess_search_keywords_dictionary(self, grouped_keywords_dictionary: dict) -> dict:
        """Clean each group's keyword string into a list of processed keywords.

        The configured text manipulation method removes symbols and lower-cases
        the text; duplicates inside a group are dropped when ``self.unique_keywords``.

        Parameters
        ----------
        grouped_keywords_dictionary : dict
            Example - {'keyword_group_name': "Management investing corporate pricing risk Risk Pre-process",...}

        Returns
        -------
        dict
            Example - {'keyword_group_name': ["management", "investing", "corporate", "pricing", "risk", "pre",
            "process"],...}

        """
        preprocessed_clean_grouped_keywords_dictionary = {}
        for keyword_group_name, keywords in grouped_keywords_dictionary.items():
            # NOTE(review): self.args / self.kwargs are passed positionally (not
            # unpacked) — assumed to match text_manipulation_methods' signature.
            preprocessed_string = string_manipulation.text_manipulation_methods(
                keywords, self.text_manipulation_method_name, self.custom_text_manipulation_function,
                self.args, self.kwargs)
            words = preprocessed_string.split()
            if self.unique_keywords:
                words = string_manipulation.split_words_remove_duplicates(words)
            preprocessed_clean_grouped_keywords_dictionary[keyword_group_name] = words
        return preprocessed_clean_grouped_keywords_dictionary

    def preprocess_searched_keywords(self, grouped_keywords_dictionary: dict) -> dict:
        """Preprocess keywords and optionally de-duplicate them across groups.

        Parameters
        ----------
        grouped_keywords_dictionary : dict
            Example - {'keyword_group_name': "Management investing corporate pricing risk Risk Pre-process",...}

        Returns
        -------
        dict
            Example - {'keyword_group_1': ["management", "investing", "risk", "pre", "process"],
            'keyword_group_2': ["corporate", "pricing"],...}
            'risk' is removed from keyword_group_2 when ``self.all_unique_keywords`` is True.

        """
        preprocessed_keywords = self.preprocess_search_keywords_dictionary(grouped_keywords_dictionary)
        if self.all_unique_keywords:
            return remove_duplicates_keywords_from_next_groups(preprocessed_keywords)
        return preprocessed_keywords

    def creating_default_keyword_count_dict(self):
        """Initialise keyword count dict with value 0 for every keyword.

        Returns
        -------
        dict
            {'total_keywords': 0, '<group>_count': 0, '<keyword>': 0, ...}

        """
        keyword_count_dict = {"total_keywords": 0}
        for group_name, keywords_list in self.value.items():
            keyword_count_dict[str(group_name) + "_count"] = 0
            for keyword in keywords_list:
                keyword_count_dict[keyword] = 0
        return keyword_count_dict

    def get_sorting_keywords_criterion_list(self) -> List[str]:
        """Build the sorting criterion column list in the desired logical order.

        Returns
        -------
        List[str]
            total_keywords first, then every '<group>_count', then every keyword.

        """
        sorting_keywords_criterion_list = ["total_keywords"]
        for keyword_group_name in self.value.keys():
            sorting_keywords_criterion_list.append(str(keyword_group_name) + "_count")
        for keywords_list in self.value.values():
            sorting_keywords_criterion_list.extend(keywords_list)
        return sorting_keywords_criterion_list

    def generate_keywords_count_dictionary(self, text):
        """Count occurrences of every search word in ``text``.

        Parameters
        ----------
        text : str
            Preprocessed text whose whitespace-separated words are matched
            against the keyword groups.

        Returns
        -------
        dict
            Counts dict with 'total_keywords', per-group '<group>_count' and
            per-keyword entries.

        """
        keyword_count_dict = self.creating_default_keyword_count_dict()
        total_keywords_counts = 0
        for searched_word in text.split():
            # a word present in several groups is counted once per matching group
            for keyword_group_name, unique_keywords in self.value.items():
                if searched_word in unique_keywords:
                    total_keywords_counts += 1
                    keyword_count_dict[str(keyword_group_name) + "_count"] += 1
                    keyword_count_dict[searched_word] += 1
        keyword_count_dict["total_keywords"] = total_keywords_counts
        return keyword_count_dict
class SearchCount:
    """Used to search search_words in citations and research papers.

    This can output both records list and pandas.DataFrame as well as can take both inputs.
    """

    # column that flags whether a research paper was downloaded ("yes" to count it)
    download_flag_column_name = 'downloaded'
    # column holding the research paper file path
    research_paper_file_location_column_name = 'file location'
    # column holding the citation full text
    citation_text_column_name = "citation_text"

    def __init__(self, data: Union[List[dict], pd.DataFrame], search_words_object: SearchWords,
                 text_manipulation_method_name: str = "preprocess_string", custom_text_manipulation_function=None,
                 *args, **kwargs):
        """Set up all necessary data for start counting.

        Parameters
        ----------
        data : Union[List[dict], pd.DataFrame]
            Citations or research-papers data; a DataFrame is converted to a records list.
        search_words_object : search_count.SearchWords
            search_words_object should contain dictionary comprised of unique search_words_object in each keyword
            groups. Example - {'keyword_group_1': ["management", "investing", "risk", "pre", "process"],
            'keyword_group_2': ["corporate", "pricing"],...}
        text_manipulation_method_name : str
            provides the options to use any text manipulation function.
            preprocess_string (default and applied before all other implemented functions)
        custom_text_manipulation_function : function
            This is optional custom_text_manipulation_function function if you want to implement this yourself.
            pass as custom_text_manipulation_function = function_name.
        kwargs : Dict[str, Any]
            These key = word or {key: word} arguments are for custom_text_manipulation_function
        args : Tuple[Any, Any]
            These arguments are for custom_text_manipulation_function
        """
        self.args = args
        self.kwargs = kwargs
        self.custom_text_manipulation_function = custom_text_manipulation_function
        # isinstance instead of type() == for idiomatic, subclass-friendly checks.
        self.data = converter.dataframe_to_records_list(data) if isinstance(data, pd.DataFrame) else data
        self.text_manipulation_method_name = text_manipulation_method_name
        self.search_words_object = search_words_object

    def counts(self) -> List[Dict[str, Any]]:
        """Dispatch to the citation or research-paper counter based on the data columns.

        Returns
        -------
        List[Dict[str, Any]]
            records list containing the citation data or research papers data with counts.

        """
        if (self.download_flag_column_name in self.data[0]) and (
                self.research_paper_file_location_column_name in self.data[0]):
            return self.count_search_words_in_research_paper_text(self.data)
        else:
            return self.count_search_words_in_citations_text(self.data)

    def count_search_words_in_citations_text(self, citations_records_list: List[Dict[str, Any]]
                                             ) -> List[Dict[str, Any]]:
        """Loop over each citations to count search words (SearchWords instance) in citation data.

        Parameters
        ----------
        citations_records_list : List[Dict[str, Any]]
            This list contains all the citations details with column named 'citation_text' containing full text
            like article name, abstract and keyword.

        Returns
        -------
        List[Dict[str, Any]]
            This is the list of all citations search result which contains our all search_words_object count.
            Examples - [{'title': 'name', 'total_keywords': count, 'keyword_group_1_count': count,
            "management": count, ...},...]

        """
        final_list_of_full_search_words_counts_citations_dict = []
        for citation_dict in citations_records_list:
            # normalise the citation text with the configured manipulation method
            text = string_manipulation.text_manipulation_methods(citation_dict[self.citation_text_column_name],
                                                                 self.text_manipulation_method_name,
                                                                 self.custom_text_manipulation_function,
                                                                 self.args, self.kwargs)
            search_words_counts_dict = self.search_words_object.generate_keywords_count_dictionary(text)
            # merge citation fields with their search-word counts
            full_search_words_counts_dict = {**citation_dict, **search_words_counts_dict}
            final_list_of_full_search_words_counts_citations_dict.append(full_search_words_counts_dict)
        return final_list_of_full_search_words_counts_citations_dict

    def count_search_words_in_research_paper_text(self, research_papers_records_list: List[Dict[str, Any]]
                                                  ) -> List[Dict[str, Any]]:
        """Loop over validated research paper to count search words (SearchWords instance) in research papers data.

        Parameters
        ----------
        research_papers_records_list : List[Dict[str, Any]]
            This list contains data of all the research papers files contained in directory_path.

        Returns
        -------
        List[Dict[str, Any]]
            This is the list of all research papers search result which contains our all search_words_object count.
            Examples - [{'article': 'article_name', 'total_keywords': count, 'keyword_group_1_count': count,
            "management": count, ...},...]

        """
        final_list_of_full_search_words_counts_citations_dict = []
        for research_papers_record in research_papers_records_list:
            # only papers flagged as downloaded can be read from disk
            if research_papers_record[self.download_flag_column_name] != "yes":
                continue
            research_paper_text = converter.Reader(
                research_papers_record[self.research_paper_file_location_column_name]).get_text()
            text = string_manipulation.text_manipulation_methods(research_paper_text,
                                                                 self.text_manipulation_method_name,
                                                                 self.custom_text_manipulation_function,
                                                                 self.args, self.kwargs)
            search_words_counts_dict = self.search_words_object.generate_keywords_count_dictionary(text)
            # merge paper fields with their search-word counts
            full_search_words_counts_dict = {**research_papers_record, **search_words_counts_dict}
            final_list_of_full_search_words_counts_citations_dict.append(full_search_words_counts_dict)
        return final_list_of_full_search_words_counts_citations_dict

    def get_records_list(self) -> List[Dict[str, Any]]:
        """Outputs the records list containing counts results of input data.

        Returns
        -------
        List[Dict[str, Any]]
            This is the list of records which contains all search_words_object count from input data.

        """
        return self.counts()

    def get_dataframe(self):
        """Outputs the pandas.DataFrame containing counts results of input data.

        Returns
        -------
        pandas.DataFrame
            This is the dataframe of all search results which contains our all search_words_object count.

        """
        return converter.records_list_to_dataframe(self.counts())

    def to_csv(self, output_filename: Union[str, None] = "output.csv", index: bool = True):
        """This function saves pandas.DataFrame to csv file.

        Parameters
        ----------
        output_filename : str
            This is the name of output file which should contains .csv extension
        index : bool
            Define if index is needed in output csv file or not.

        Returns
        -------

        """
        converter.dataframe_to_csv_file(self.get_dataframe(), output_filename, index)

    def to_excel(self, output_filename: Union[str, None] = "output.xlsx", index: bool = True):
        """This function saves pandas.DataFrame to excel file.

        BUG FIX: the default filename was "output.csv", contradicting the
        documented .xlsx extension for excel output.

        Parameters
        ----------
        output_filename : str
            This is the name of output file which should contains .xlsx extension
        index : bool
            Define if index is needed in output excel file or not.

        Returns
        -------

        """
        converter.dataframe_to_excel_file(self.get_dataframe(), output_filename, index)
def adding_dict_key_or_increasing_value(input_dict: dict, dict_key: str, step: int = 1, default_dict_value: int = 1):
    """Increase the value of dict(key:value) by step using key.

    If key not present then it get initialised with default dict value.

    Parameters
    ----------
    input_dict : dict
        This is the dictionary which we want to modify.
    dict_key : str
        This is the key of dictionary
    step : int
        This is the addition number by which value of dictionary needed to be increased.
    default_dict_value : int
        If key is not available in dictionary then this default value is used to add new key.

    Returns
    -------
    dict
        This is the modified dictionary

    """
    # EAFP: attempt the increment, initialise on a missing key.
    try:
        input_dict[dict_key] += step
    except KeyError:
        input_dict[dict_key] = default_dict_value
    return input_dict
def count_words_in_list_of_lists(list_of_lists: List[list]) -> dict:
    """count words in list containing other lists with words.

    Parameters
    ----------
    list_of_lists : List[list]
        This list contains each element of type list.

    Returns
    -------
    dict
        dictionary with key as words and value as counts

    """
    words_frequency = {}
    for inner_list in list_of_lists:
        for raw_keyword in inner_list:
            # normalise before counting so variants collapse to one key
            normalised_keyword = string_manipulation.preprocess_string(raw_keyword)
            words_frequency = adding_dict_key_or_increasing_value(words_frequency, normalised_keyword)
    return words_frequency
def count_keywords_in_citations_full_text_list(citations_with_fulltext_list: list,
                                               unique_preprocessed_clean_grouped_keywords_dict: dict,
                                               title_column_name: str = "title",
                                               method: str = "preprocess_string",
                                               custom=None) -> list:
    """Loop over articles to calculate search_words_object counts

    Parameters
    ----------
    custom : function
        This is optional custom_text_manipulation_function function if you want to implement this yourself.
        pass as custom_text_manipulation_function = function_name. it will take text as parameter with no default
        preprocess_string operation.
    method : str
        provides the options to use any text manipulation function.
        preprocess_string (default and applied before all other implemented functions)
    citations_with_fulltext_list : list
        This list contains all the citations details with column named 'citation_text' containing full text like
        article name, abstract and keyword.
    unique_preprocessed_clean_grouped_keywords_dict : dict
        looks like this {'keyword_group_1': ["management", "investing", "risk", "pre", "process"],
        'keyword_group_2': ["corporate", "pricing"],...}
    title_column_name : str
        This is the name of column which contain citation title

    Returns
    -------
    list
        This is the list of all citations search result which contains our all search_words_object count.
        Examples - [{'primary_title': 'name', 'total_keywords': count, 'keyword_group_1_count': count,
        "management": count, ...},...]

    """
    final_list_of_full_keywords_counts_citations_dict = []
    keyword_count_dict = creating_keyword_count_dict(unique_preprocessed_clean_grouped_keywords_dict)
    # iterating through each citation details one by one.
    for citation_dict in citations_with_fulltext_list:
        print(f"article: {citation_dict[title_column_name]}")
        # BUG FIX: copy the record instead of aliasing it; the original mutated
        # the caller's citation dicts in place via update().
        full_keywords_counts_dict = dict(citation_dict)
        full_keywords_counts_dict.update(keyword_count_dict)
        total_keywords_counts = 0
        citation_full_text = string_manipulation.text_manipulation_methods(citation_dict['citation_text'],
                                                                           method, custom).split()
        # taking words one by one from full_text of citation.
        for searched_word in citation_full_text:
            # checking the word in grouped search words and add to full_keywords_counts_dict.
            for keyword_group_name, unique_keywords in unique_preprocessed_clean_grouped_keywords_dict.items():
                if searched_word in unique_keywords:
                    total_keywords_counts += 1
                    full_keywords_counts_dict = adding_dict_key_or_increasing_value(
                        full_keywords_counts_dict, str(keyword_group_name) + "_count")
                    full_keywords_counts_dict = adding_dict_key_or_increasing_value(
                        full_keywords_counts_dict, searched_word)
        full_keywords_counts_dict.update({"total_keywords": total_keywords_counts})
        final_list_of_full_keywords_counts_citations_dict.append(full_keywords_counts_dict)
    return final_list_of_full_keywords_counts_citations_dict
def creating_keyword_count_dict(unique_preprocessed_clean_grouped_keywords_dict: dict):
    """Initialise keyword count dict with value 0 for every keyword.

    Parameters
    ----------
    unique_preprocessed_clean_grouped_keywords_dict : dict
        looks like this {'keyword_group_1': ["management", "investing", "risk", "pre", "process"],
        'keyword_group_2': ["corporate", "pricing"],...}

    Returns
    -------
    dict
        This contains key as keyword and value as 0.

    """
    zeroed_counts = {}
    for group_name, keywords_list in unique_preprocessed_clean_grouped_keywords_dict.items():
        # one '<group>_count' entry per group, followed by its keywords
        zeroed_counts[str(group_name) + "_count"] = 0
        for keyword in keywords_list:
            zeroed_counts[keyword] = 0
    return zeroed_counts
def count_search_words_in_citations_text(citations_with_fulltext_list: list, search_words_object: SearchWords,
                                         text_column_name: str = "citation_text",
                                         text_manipulation_method_name: str = "preprocess_string",
                                         custom=None, custom_text_manipulation_function=None,
                                         *args, **kwargs) -> list:
    """Loop over articles to calculate search_words_object counts

    BUG FIX: the default for ``text_column_name`` was "'citation_text'" —
    including literal quote characters — which could never match the real
    'citation_text' column and raised KeyError on every record.

    Parameters
    ----------
    custom_text_manipulation_function : function
        This is optional custom_text_manipulation_function function if you want to implement this yourself.
        pass as custom_text_manipulation_function = function_name. it will take text as parameter with no default
        preprocess_string operation.
    custom : function
        Unused; retained only for backward compatibility of the signature.
    text_manipulation_method_name : str
        provides the options to use any text manipulation function.
        preprocess_string (default and applied before all other implemented functions)
    citations_with_fulltext_list : list
        This list contains all the citations details with column named 'citation_text' containing full text like
        article name, abstract and keyword.
    search_words_object : SearchWords
        its .value looks like this {'keyword_group_1': ["management", "investing", "risk", "pre", "process"],
        'keyword_group_2': ["corporate", "pricing"],...}
    text_column_name : str
        This is the name of column which contain citation text

    Returns
    -------
    list
        This is the list of all citations search result which contains our all search_words_object count.
        Examples - [{'primary_title': 'name', 'total_keywords': count, 'keyword_group_1_count': count,
        "management": count, ...},...]

    """
    final_list_of_full_search_words_counts_citations_dict = []
    # iterating through each citation details one by one.
    for citation_dict in citations_with_fulltext_list:
        # normalise the citation text with the selected manipulation method
        text = string_manipulation.text_manipulation_methods(citation_dict[text_column_name],
                                                             text_manipulation_method_name,
                                                             custom_text_manipulation_function, args, kwargs)
        search_words_counts_dict = search_words_object.generate_keywords_count_dictionary(text)
        # merge citation fields with their search-word counts
        full_search_words_counts_dict = {**citation_dict, **search_words_counts_dict}
        final_list_of_full_search_words_counts_citations_dict.append(full_search_words_counts_dict)
    return final_list_of_full_search_words_counts_citations_dict
def citation_list_of_dict_search_count_to_df(citations_list: list, keywords: dict, title_column_name: str = "title",
                                             method: str = "preprocess_string", custom=None) -> pd.DataFrame:
    """Loop over articles to calculate search_words_object counts and return dataframe.

    Parameters
    ----------
    title_column_name : str
        This is the name of column which contain citation title
    custom : function
        This is optional custom_text_manipulation_function function if you want to implement this yourself.
        pass as custom_text_manipulation_function = function_name. it will take text as parameter with no default
        preprocess_string operation.
    method : str
        provides the options to use any text manipulation function.
        preprocess_string (default and applied before all other implemented functions)
    citations_list : list
        list with additional columns needed for next steps of systematic review and duplicates are removed
    keywords : dict
        This is output dictionary which contains processed non-duplicate search_words_object dict.
        Example - {'keyword_group_name': ["management", "investing", "corporate", "pricing", "risk", "pre",
        "process"],...}

    Returns
    -------
    pandas.DataFrame object
        This is pandas object of all citations search result which contains our all search_words_object count.

    """
    # count per citation, then convert the records list into a dataframe
    counted_records = count_keywords_in_citations_full_text_list(citations_list, keywords,
                                                                 title_column_name, method, custom)
    return converter.records_list_to_dataframe(counted_records)
def count_keywords_in_citations_full_text(dataframe_citations_with_fulltext: pd.DataFrame,
                                          unique_preprocessed_clean_grouped_keywords_dict: dict,
                                          title_column_name: str = "title",
                                          method: str = "preprocess_string", custom=None) -> list:
    """Loop over articles to calculate keywords counts

    Parameters
    ----------
    custom : function
        This is optional custom function if you want to implement this yourself. pass as custom = function_name.
        it will take text as parameter with no default preprocess_string operation.
    method : str
        provides the options to use any text manipulation function.
        preprocess_string (default and applied before all other implemented functions)
    dataframe_citations_with_fulltext : pd.DataFrame
        This dataframe contains all the citations details with column named 'full_text' containing full text like
        article name, abstract and keyword.
    unique_preprocessed_clean_grouped_keywords_dict : dict
        looks like this {'keyword_group_1': ["management", "investing", "risk", "pre", "process"],
        'keyword_group_2': ["corporate", "pricing"],...}
    title_column_name : str
        This is the name of column which contain citation title

    Returns
    -------
    list
        This is the list of all citations search result which contains our all keywords count.
        Examples - [{'primary_title': 'name', 'total_keywords': count, 'keyword_group_1_count': count,
        "management": count, ...},...]

    """
    results = []
    zeroed_counts = creating_keyword_count_dict(unique_preprocessed_clean_grouped_keywords_dict)
    # one counts record per citation row
    for _, citation_row in dataframe_citations_with_fulltext.iterrows():
        print(f"article: {citation_row[title_column_name]}")
        counts_record = {title_column_name: str(citation_row[title_column_name])}
        counts_record.update(zeroed_counts)
        matched_total = 0
        words = string_manipulation.text_manipulation_methods(citation_row['full_text'], method, custom).split()
        for word in words:
            # a word matching several groups is counted once per group
            for group_name, group_keywords in unique_preprocessed_clean_grouped_keywords_dict.items():
                if word in group_keywords:
                    matched_total += 1
                    counts_record = adding_dict_key_or_increasing_value(counts_record,
                                                                        str(group_name) + "_count")
                    counts_record = adding_dict_key_or_increasing_value(counts_record, word)
        counts_record.update({"total_keywords": matched_total})
        results.append(counts_record)
    return results
def citation_search_count_dataframe(citations_df: pd.DataFrame, keywords: dict, title_column_name: str = "title",
                                    method: str = "preprocess_string", custom=None) -> pd.DataFrame:
    """Loop over articles to calculate keywords counts and return dataframe.

    Parameters
    ----------
    title_column_name : str
        This is the name of column which contain citation title
    custom : function
        This is optional custom function if you want to implement this yourself. pass as custom = function_name.
        it will take text as parameter with no default preprocess_string operation.
    method : str
        provides the options to use any text manipulation function.
        preprocess_string (default and applied before all other implemented functions)
    citations_df : pandas.DataFrame object
        DataFrame with additional columns needed for next steps of systematic review and duplicates are removed
    keywords : dict
        This is output dictionary which contains processed non-duplicate keywords dict.
        Example - {'keyword_group_name': ["management", "investing", "corporate", "pricing", "risk", "pre",
        "process"],...}

    Returns
    -------
    pandas.DataFrame object
        This is pandas object of all citations search result which contains our all keywords count.

    """
    # count per row, then convert the records list back into a dataframe
    counted_records = count_keywords_in_citations_full_text(citations_df, keywords,
                                                            title_column_name, method, custom)
    return converter.records_list_to_dataframe(counted_records)
def count_keywords_in_pdf_full_text(list_of_downloaded_articles_path: list,
                                    unique_preprocessed_clean_grouped_keywords_dict: dict,
                                    title_column_name: str = "cleaned_title_pdf",
                                    method: str = "preprocess_string", custom=None) -> list:
    """Loop over articles pdf files to calculate keywords counts.

    Parameters
    ----------
    list_of_downloaded_articles_path : list
        This list contains path of all the pdf files contained in directory_path.
    unique_preprocessed_clean_grouped_keywords_dict : dict
        looks like this {'keyword_group_1': ["management", "investing", "risk", "pre", "process"],
        'keyword_group_2': ["corporate", "pricing"],...}
    title_column_name : str
        This is the name of column which contain citation title.
    method : str
        provides the options to use any text manipulation function.
        preprocess_string (default and applied before all other implemented functions)
        custom - for putting your custom function to preprocess the text
        nltk_remove_stopwords, pattern_lemma_or_lemmatize_text, nltk_word_net_lemmatizer,
        nltk_porter_stemmer, nltk_lancaster_stemmer, spacy_lemma,
        nltk_remove_stopwords_spacy_lemma, convert_string_to_lowercase
    custom : function
        This is optional custom function if you want to implement this yourself. pass as
        custom = function_name. it will take text as parameter with no default
        preprocess_string operation.

    Returns
    -------
    list
        This is the list of all citations search result which contains our all keywords count.
        Examples - [{'article': 'article_name', 'total_keywords': count,
        'keyword_group_1_count': count, "management": count, "investing: count", "risk: count",
        'keyword_group_2_count': count, "corporate": count, "pricing": count,...}]

    """
    final_list_of_full_keywords_counts_pdf_text_dict = []
    keyword_count_dict = creating_keyword_count_dict(unique_preprocessed_clean_grouped_keywords_dict)
    # Performance: membership is tested once per word of every PDF's full text, so
    # convert each keyword group to a set up front (O(1) lookups instead of scanning
    # the group list for every word). Behavior is unchanged.
    grouped_keyword_sets = {group_name: set(keywords_list)
                            for group_name, keywords_list
                            in unique_preprocessed_clean_grouped_keywords_dict.items()}
    # iterating through each pdf path one by one.
    for pdf_path in list_of_downloaded_articles_path:
        article_name = string_manipulation.preprocess_string_to_space_separated_words(
            string_manipulation.pdf_filename_from_filepath(pdf_path))
        print("article: ", article_name)
        # Seed the per-article record with the title and zeroed keyword counters.
        full_keywords_counts_dict = {title_column_name: str(article_name)}
        full_keywords_counts_dict.update(keyword_count_dict)
        total_keywords_counts = 0
        try:
            pdf_text = converter.get_text_from_multiple_pdf_reader(pdf_path)
        except FileNotFoundError:
            # Deliberate best-effort: a missing/unreadable path skips this article
            # instead of aborting the whole batch.
            continue
        pdf_full_text = string_manipulation.text_manipulation_methods(pdf_text, method, custom).split()
        # taking words one by one from full_text of pdf file.
        for searched_word in pdf_full_text:
            # checking the word in grouped keywords and add to full_keywords_count_dict.
            for keyword_group_name, unique_keywords in grouped_keyword_sets.items():
                if searched_word in unique_keywords:
                    total_keywords_counts += 1
                    group_name_count = str(keyword_group_name) + "_count"
                    full_keywords_counts_dict = adding_dict_key_or_increasing_value(
                        full_keywords_counts_dict, group_name_count)
                    full_keywords_counts_dict = adding_dict_key_or_increasing_value(
                        full_keywords_counts_dict, searched_word)
        full_keywords_counts_dict.update({"total_keywords": total_keywords_counts})
        final_list_of_full_keywords_counts_pdf_text_dict.append(full_keywords_counts_dict)
    return final_list_of_full_keywords_counts_pdf_text_dict
def pdf_full_text_search_count_dataframe(list_of_downloaded_articles_path: list,
                                         unique_preprocessed_clean_grouped_keywords_dict: dict,
                                         title_column_name: str = "cleaned_title",
                                         method: str = "preprocess_string", custom=None
                                         ) -> pd.DataFrame:
    """Count keywords across all downloaded article PDFs and return the results as a DataFrame.

    Thin wrapper: runs ``count_keywords_in_pdf_full_text`` over the given paths and converts
    the resulting per-article records into a pandas DataFrame.

    Parameters
    ----------
    list_of_downloaded_articles_path : list
        This list contains path of all the pdf files contained in directory_path.
    unique_preprocessed_clean_grouped_keywords_dict : dict
        looks like this {'keyword_group_1': ["management", "investing", "risk", "pre", "process"],
        'keyword_group_2': ["corporate", "pricing"],...}
    title_column_name : str
        This is the name of column which contain citation title.
    method : str
        provides the options to use any text manipulation function.
        preprocess_string (default and applied before all other implemented functions)
        custom - for putting your custom function to preprocess the text
        nltk_remove_stopwords, pattern_lemma_or_lemmatize_text, nltk_word_net_lemmatizer,
        nltk_porter_stemmer, nltk_lancaster_stemmer, spacy_lemma,
        nltk_remove_stopwords_spacy_lemma, convert_string_to_lowercase
    custom : function
        This is optional custom function if you want to implement this yourself. pass as
        custom = function_name. it will take text as parameter with no default
        preprocess_string operation.

    Returns
    -------
    pandas.DataFrame object
        This is the dataframe of all citations search result which contains our all keywords count.
        Examples - [{'article': 'article_name', 'total_keywords': count,
        'keyword_group_1_count': count, "management": count, "investing: count", "risk: count",
        'keyword_group_2_count': count, "corporate": count, "pricing": count,...}]

    """
    per_article_counts = count_keywords_in_pdf_full_text(
        list_of_downloaded_articles_path, unique_preprocessed_clean_grouped_keywords_dict,
        title_column_name, method, custom)
    return converter.records_list_to_dataframe(per_article_counts)
def get_sorting_keywords_criterion_list(unique_preprocessed_clean_grouped_keywords_dict: dict) -> list:
    """Build the column-ordering criterion list derived from the grouped keywords.

    The resulting order is: 'total_keywords' first, then one '<group>_count' column per
    keyword group (in dict order), then every individual keyword (group by group).

    Parameters
    ----------
    unique_preprocessed_clean_grouped_keywords_dict : dict
        This is the dictionary comprised of unique keywords in each keyword groups. It means
        keyword from first keyword group can not be found in any other keyword group.
        Example - {'keyword_group_1': ["management", "investing", "risk", "pre", "process"],
        'keyword_group_2': ["corporate", "pricing"],...}. 'risk' is removed from keyword_group_2.

    Returns
    -------
    list
        This is the sorting criterion list which contains column in logical manner we desire.
        It contains total_keywords, group_keywords_counts, and keywords in the last.

    """
    criterion_columns = ["total_keywords"]
    # One aggregate count column per keyword group, preserving dict order.
    criterion_columns += [str(group_name) + "_count"
                          for group_name in unique_preprocessed_clean_grouped_keywords_dict]
    # Then each keyword column, group by group.
    for group_keywords in unique_preprocessed_clean_grouped_keywords_dict.values():
        criterion_columns.extend(group_keywords)
    return criterion_columns
def adding_citation_details_with_keywords_count_in_pdf_full_text(filter_sorted_citations_df: pd.DataFrame,
                                                                 pdf_full_text_search_count: list,
                                                                 unique_preprocessed_clean_grouped_keywords_dict: dict,
                                                                 first_column_name: str = "cleaned_title",
                                                                 second_column_name: str = 'cleaned_title_pdf') -> pd.DataFrame:
    """Attach citation details to the PDF full-text keyword counts by matching article titles.

    Drops the keyword-criterion columns from the sorted citations frame, then validates each
    remaining citation record against the PDF search-count records; only matched records are
    kept in the returned DataFrame.

    Parameters
    ----------
    filter_sorted_citations_df : pandas.DataFrame object
        This is the sorted dataframe which contains columns in this sequential manner. It contains
        citation df, total_keywords, group_keywords_counts, and keywords_counts in the last.
    pdf_full_text_search_count : list
        This is the list of all citations search result which contains our all search_words_object
        count. Examples - [{'article': 'article_name', 'total_keywords': count,
        'keyword_group_1_count': count, "management": count, "investing: count", "risk: count",
        'keyword_group_2_count': count, "corporate": count, "pricing": count,...}]
    unique_preprocessed_clean_grouped_keywords_dict : dict
        This is the dictionary comprised of unique search_words_object in each keyword groups.
        Example - {'keyword_group_1': ["management", "investing", "risk", "pre", "process"],
        'keyword_group_2': ["corporate", "pricing"],...}. 'risk' is removed from keyword_group_2.
    first_column_name : str
        This is the name of column which contain citation title.
    second_column_name : str
        This is the name of column which contain pdf article title.

    Returns
    -------
    pandas.DataFrame object
        This dataframe contains citations details from filtered and sorted citation full text
        dataframe and search_words_object counts from searching in pdf file text.

    """
    # Columns added by the keyword-count step; removing them leaves only citation details.
    criterion_columns = get_sorting_keywords_criterion_list(unique_preprocessed_clean_grouped_keywords_dict)
    citation_details_df = filter_sorted_citations_df.drop(columns=criterion_columns)
    citation_records = converter.dataframe_to_records_list(citation_details_df)
    # Keep only citation records whose title matches a PDF search-count record;
    # unmatched records are discarded.
    matched_records, _unmatched_records = validation.deep_validate_column_details_between_two_record_list(
        citation_records, pdf_full_text_search_count, first_column_name, second_column_name)
    return converter.records_list_to_dataframe(matched_records)