Source code for systematic_review.citation

"""Module: citation
This module contains functions which changes format or get details from citations. It also include functions to fix some
typos.
"""

import re
from typing import Literal, List, Dict, Any, Union

import pandas as pd
from systematic_review import string_manipulation, search_count
from systematic_review import converter


[docs]def citations_to_ris_converter(input_file_path: str, output_filename: str = "output_ris_file.ris", input_file_type: str = "read_csv") -> None: """ This asks for citations columns name from tabular data and then convert the data to ris format. Parameters ---------- input_file_path : str this is the path of input file output_filename : str this is the name of the output ris file with extension. output file path is also valid choice. input_file_type : str this function default is csv but other formats are also supported by putting 'read_{file_type}'. such as input_file_type = 'read_excel' all file type supported by pandas can be used by putting pandas IO tools methods. for more info visit- https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html Returns ------- None """ df = getattr(pd, input_file_type)(input_file_path) pd.set_option("display.max_columns", None) print(df.head()) print("please specify column names for following data in input file:") article_type = input("specify column for type of given citations. if it's journal article, input :JOUR") authors = input("provide name of authors column") publication_year = input("provide name of Publication Year column") item_title = input("provide name of Item Title column") publication_title = input("provide name of Publication Title column") journal_volume = input("provide name of Journal Volume column") journal_issue = input("provide name of Journal Issue column") url = input("provide name of URL column") doi = input("provide name of Item DOI column") number_of_records = len(df) output_file = open(output_filename, "a") for index in range(number_of_records): output_file.write(f"TY - {article_type}\n") output_file.write("AU - " + str(df.iloc[index][authors]) + "\n") output_file.write("PY - " + str(df.iloc[index][publication_year]) + "\n") output_file.write("TI - " + str(df.iloc[index][item_title]) + "\n") output_file.write("JO - " + str(df.iloc[index][publication_title]) + "\n") output_file.write("VL - " + str(df.iloc[index][journal_volume]) + "\n") output_file.write("IS - " + str(df.iloc[index][journal_issue]) + "\n") output_file.write("UR - " + str(df.iloc[index][url]) + "\n") output_file.write("DO - " + str(df.iloc[index][doi]) + "\n") output_file.write("ER -" + "\n") # \n is placed to indicate EOL (End of Line) output_file.close() print("ris file has been generated")
[docs]def edit_ris_citation_paste_values_after_regex_pattern(input_file_path: str, output_filename: str = "output_file.ris", edit_line_regex: str = r'^DO ', paste_value: str = "ER - ") \ -> None: """ This is created to edit ris files which doesn't specify ER for 'end of citations' and paste ER after end point of citation, replace 'DO' with other ris classifiers such as TY, JO etc. Parameters ---------- input_file_path : str this is the path of input file output_filename : str this is the name of the output ris file with extension. edit_line_regex : str this is the regex to find ris classifiers lines such as DO, TY, JO etc. paste_value : str this is value to be pasted, most helpful is ER ris classifier which signify citation end. Returns ------- None """ input_file = open(input_file_path, "r") output_file = open(output_filename, "a") for line in input_file: output_file.write(line) if re.match(edit_line_regex, line): output_file.write(f"{paste_value}\n") input_file.close() output_file.close()
[docs]def get_details_via_article_name_from_citations(article_name: str, sources_name_citations_path_list_of_dict: list, doi_url: bool = False, title_column_name: str = "title") -> dict: """Iterate through citations and find article_name and put source_name in column, with doi and url being optional Parameters ---------- article_name : str This is the primary title of the citation or name of the article. sources_name_citations_path_list_of_dict : list This is the list of all the sources names and it's citations at dir_path. Examples - {'sources_name': 'all source articles citations', ...} doi_url : bool This signify if we want to get the value of url and doi from citation title_column_name : str This is the name of column which contain citation title Returns ------- dict This dict contains the article_name, source_name and optional url and doi """ for citations in sources_name_citations_path_list_of_dict[1]: if article_name == string_manipulation.preprocess_string(citations[title_column_name]): article_title_source_name_dict = {"article_name": article_name, "source_name": sources_name_citations_path_list_of_dict[0]} if doi_url: # optional block if you want url and doi try: article_title_source_name_dict["doi"] = citations["doi"] article_title_source_name_dict["url"] = citations["url"] except KeyError: print("doi or url not present") pass return article_title_source_name_dict
[docs]def get_details_of_all_article_name_from_citations(filtered_list_of_dict: list, sources_name_citations_path_list_of_dict: list, doi_url: bool = False, title_column_name: str = "title") -> list: """This function searches source names, doi, and url for all articles in filtered_list_of_dict. Parameters ---------- filtered_list_of_dict : list This is the list of article citations dict after filtering it using min_limit on grouped_keywords_count sources_name_citations_path_list_of_dict : list This is the list of all the sources names and it's citations at dir_path. Examples - {'sources_name': 'all source articles citations', ...} doi_url : bool This signify if we want to get the value of url and doi from citation title_column_name : str This is the name of column which contain citation title Returns ------- list This list contains all article names with source names. (optional url and doi) """ all_articles_title_source_name_list_of_dict = [] for article_details in filtered_list_of_dict: article_name = article_details[title_column_name] print("article: ", article_name) articles_title_source_name_dict = get_details_via_article_name_from_citations( article_name, sources_name_citations_path_list_of_dict, doi_url) all_articles_title_source_name_list_of_dict.append(articles_title_source_name_dict) return all_articles_title_source_name_list_of_dict
[docs]def get_missed_articles_source_names(missed_articles_list: list, all_articles_title_source_name_list_of_dict: list, article_column_name: str = "article_name", source_column_name: str = "source_name") -> list: """ Parameters ---------- missed_articles_list : list This contains the list of articles that got missed while downloading. all_articles_title_source_name_list_of_dict : list This list contains all article names with source names. (optional url and doi) article_column_name : str This is the name of article column in the all_articles_title_source_name_list_of_dict. source_column_name : str This is the name of source column in the all_articles_title_source_name_list_of_dict. Returns ------- list This list contains articles_name and sources name. """ missed_article_name_and_source_name_list = [] for dict_element in all_articles_title_source_name_list_of_dict: if dict_element[article_column_name] in set(missed_articles_list): missed_article_name_and_source_name_list.append( {"article_name": dict_element[article_column_name], "source_name": dict_element[source_column_name]}) return missed_article_name_and_source_name_list
[docs]def drop_columns_based_on_column_name_list(dataframe: pd.DataFrame, column_name_list: list) -> pd.DataFrame: """This function drop columns based on the column name in the list. Parameters ---------- dataframe : pandas.DataFrame object This dataframe contains columns which we want to drop or remove. column_name_list : list This is the name of dataframe columns to be removed Returns ------- pandas.DataFrame object DataFrame with columns mentioned in column_name_list removed. """ output_df = dataframe.drop(column_name_list, axis=1) return output_df
[docs]def drop_search_words_count_columns(dataframe, search_words_object: search_count.SearchWords) -> pd.DataFrame: """removes columns created based on the keywords. Parameters ---------- dataframe : pandas.DataFrame object This dataframe contains keywords columns which we want to drop or remove. search_words_object : dict This is the dictionary comprised of unique keywords in each keyword groups. It means keyword from first keyword group can not be found in any other keyword group. Example - {'keyword_group_1': ["management", "investing", "risk", "pre", "process"], 'keyword_group_2': ["corporate", "pricing"],...} Returns ------- pandas.DataFrame object DataFrame with keywords columns removed. """ keywords_count_cols = search_words_object.get_sorting_keywords_criterion_list() cleaned_dataframe = drop_columns_based_on_column_name_list(dataframe, keywords_count_cols) return cleaned_dataframe
[docs]def add_multiple_sources_column(citation_dataframe: pd.DataFrame, group_by: list = ['title', 'year']) -> pd.DataFrame: """This function check if citations or article title is available at more than one sources and add column named 'multiple_sources' to the dataframe with list of name of sources names. Parameters ---------- citation_dataframe : pandas.DataFrame object Input dataset which contains citations or article title with sources more than one. group_by : list column label or sequence of labels, optional Only consider certain columns for citations or article title with sources more than one, by default use all of the columns. Returns ------- pandas.DataFrame object DataFrame with additional column with list of sources names """ df = citation_dataframe.groupby(group_by)['source'].apply(list).reset_index() df = df.rename(columns={"source": "multiple_sources"}) citation_dataframe_with_multiple_sources_column = citation_dataframe.merge(df, how='left', on=group_by) return citation_dataframe_with_multiple_sources_column
[docs]def add_citation_text_column(dataframe_object: pd.DataFrame, title_column_name: str = "title", abstract_column_name: str = "abstract", keyword_column_name: str = "keywords") -> pd.DataFrame: """This takes dataframe of citations and return the full text comprises of "title", "abstract", "search_words_object" Parameters ---------- dataframe_object : pandas.DataFrame object this is the object of famous python library pandas. for more lemma_info: https://pandas.pydata.org/docs/ title_column_name : str This is the name of column which contain citation title abstract_column_name : str This is the name of column which contain citation abstract keyword_column_name : str This is the name of column which contain citation search_words_object Returns ------- pd.DataFrame this is dataframe_object comprises of full text column. """ dataframe_object["citation_text"] = dataframe_object[title_column_name].astype(str) + " " + dataframe_object[ abstract_column_name].astype(str) + " " + dataframe_object[keyword_column_name].astype(str) return dataframe_object
[docs]def drop_duplicates_citations(citation_dataframe: pd.DataFrame, subset: list = ['title', 'year'], keep: Literal["first", "last", False] = 'first', index_reset: bool = True) -> pd.DataFrame: """Return DataFrame with duplicate rows removed. Considering certain columns is optional. Indexes, including time indexes are ignored. Parameters ---------- index_reset : bool It citation_dataframe : pandas.DataFrame object Input dataset which contains duplicate rows subset : list column label or sequence of labels, optional Only consider certain columns for identifying duplicates, by default use all of the columns. keep : str options includes {'first', 'last', False}, default 'first'. Determines which duplicates (if any) to keep. - ``first`` : Drop duplicates except for the first occurrence. - ``last`` : Drop duplicates except for the last occurrence. - False : Drop all duplicates. Returns ------- pandas.DataFrame object DataFrame with duplicates removed """ clean_df = citation_dataframe.drop_duplicates(subset=subset, keep=keep).reset_index(drop=index_reset) return clean_df
[docs]class Citations: def __init__(self, citations_files_parent_folder_path, title_column_name: str = "title", text_manipulation_method_name: str = "preprocess_string_to_space_separated_words"): """ Parameters ---------- citations_files_parent_folder_path : str this is the path of parent folder of where citations files exists. title_column_name text_manipulation_method_name """ self.text_manipulation_method_name = text_manipulation_method_name self.title_column_name = title_column_name self.citations_files_parent_folder_path = citations_files_parent_folder_path
[docs] def create_citations_dataframe(self) -> pd.DataFrame: """Executes citation step. This function load all the citations from path, add required columns for next steps, and remove duplicates. Returns ------- pandas.DataFrame object DataFrame with additional columns needed for next steps of systematic review and duplicates are removed """ full_list = converter.load_multiple_ris_citations_files(self.citations_files_parent_folder_path) full_list_df = converter.records_list_to_dataframe(full_list) complete_df = add_multiple_sources_column(full_list_df) complete_df = add_citation_text_column(complete_df) new_column_name = "cleaned_" + self.title_column_name complete_df = converter.apply_custom_function_on_dataframe_column(complete_df, self.title_column_name, string_manipulation.text_manipulation_methods, new_column_name, self.text_manipulation_method_name) complete_citations_df = drop_duplicates_citations(complete_df) return complete_citations_df
[docs] def get_records_list(self) -> List[Dict[str, Any]]: """Executes citation step. This function load all the citations from path, add required columns for next steps, and remove duplicates. Returns ------- List[Dict[str, Any]] list with additional columns needed for next steps of systematic review and duplicates are removed """ return converter.dataframe_to_records_list(self.create_citations_dataframe())
[docs] def get_dataframe(self): """executes the create citations dataframe function and outputs the pd.DataFrame Returns ------- pd.DataFrame outputs the citations data. """ return self.create_citations_dataframe()
[docs] def to_csv(self, output_filename: Union[str, None] = "output.csv", index: bool = True): """This function saves pandas.DataFrame to csv file. Parameters ---------- output_filename : str This is the name of output file which should contains .csv extension index : bool Define if index is needed in output csv file or not. Returns ------- """ converter.dataframe_to_csv_file(self.get_dataframe(), output_filename, index)
[docs] def to_excel(self, output_filename: Union[str, None] = "output.csv", index: bool = True): """This function saves pandas.DataFrame to excel file. Parameters ---------- output_filename : str This is the name of output file which should contains .xlsx extension index : bool Define if index is needed in output excel file or not. Returns ------- """ converter.dataframe_to_excel_file(self.get_dataframe(), output_filename, index)