Source code for systematic_review.converter

"""Module: converter
This module contains functions related to file and data type conversion, such as list to txt file, pandas df to list of
dicts, and many more.
"""
import json
from collections import defaultdict
from typing import Union, List, Dict, Any

import pandas as pd
import rispy

from systematic_review import os_utils, string_manipulation


def dataframe_to_csv_file(dataframe_object: pd.DataFrame, output_filename: Union[str, None] = "output.csv",
                          index: bool = True) -> Union[str, None]:
    """
    Save a pandas.DataFrame in csv format.

    Parameters
    ----------
    dataframe_object : pandas.DataFrame object
        this is the object of python library pandas. for more info: https://pandas.pydata.org/docs/
    output_filename : str or None
        This is the name of output file which should contain the .csv extension. When None is given, no file is
        written and the csv text is returned instead.
    index : bool
        Define if index is needed in output csv file or not.

    Returns
    -------
    str or None
        The csv text when output_filename is None, otherwise None (the data is written to output_filename).

    """
    # DataFrame.to_csv returns the csv text when no target is given; pass it on instead of dropping it.
    return dataframe_object.to_csv(output_filename, index=index)


def dataframe_to_excel_file(dataframe_object: pd.DataFrame, output_filename: Union[str, None] = "output.xlsx",
                            index: bool = True):
    """
    Save a pandas.DataFrame to an excel file.

    Parameters
    ----------
    dataframe_object : pandas.DataFrame object
        this is the object of python library pandas. for more info: https://pandas.pydata.org/docs/
    output_filename : str
        This is the name of output file which should contain the .xlsx extension. The default is "output.xlsx"
        so that the default call produces an excel file name rather than a csv one.
    index : bool
        Define if index is needed in output excel file or not.

    Returns
    -------
    None

    """
    dataframe_object.to_excel(output_filename, index=index)


def dataframe_to_records_list(dataframe: pd.DataFrame) -> List[Dict[str, Any]]:
    """Turn every row of a pandas dataframe into a dictionary and collect them in a list (records).

    Parameters
    ----------
    dataframe : pd.DataFrame
        This is the pandas dataframe whose rows are converted, one dictionary per row keyed by column name.

    Returns
    -------
    List[Dict[str, Any]]
        This list contains the dictionaries inside as elements. Example - [{'primary_title' : "this is first title"},
        {'primary_title' : "this is second title"}, {'primary_title' : "this is third title"}]

    """
    return dataframe.to_dict(orient='records')


def extract_pandas_df_column1_row_values_based_on_column2_value(pandas_dataframe, column2_value,
                                                                column2name="source_name", column1name="article_name"):
    """Extract the values of column1 from the rows whose column2 equals column2_value.

    Parameters
    ----------
    pandas_dataframe : pd.DataFrame
        This is the pandas dataframe containing at least two columns with values.
    column2_value : object
        This should be str in normal cases but can be any object type supported in pandas for column value.
    column2name : str
        This is the name of the column by which we are extracting the column1 values.
    column1name : str
        This is the name of the column whose values we require.

    Returns
    -------
    list
        This is the list of the resultant values from column1 rows, in dataframe order. It is empty when no row
        matches.

    """
    # Select only the wanted column of the matching rows; nothing is printed, the result is just returned.
    matching_rows = pandas_dataframe[column2name] == column2_value
    return pandas_dataframe.loc[matching_rows, column1name].tolist()


def apply_custom_function_on_dataframe_column(dataframe: pd.DataFrame, column_name: str, custom_function,
                                              new_column_name: str = None, *args, **kwargs) -> pd.DataFrame:
    """Run custom_function on every element of one dataframe column and store the results in the dataframe.

    Parameters
    ----------
    dataframe : pd.DataFrame
        This is the pandas dataframe holding the column to transform. It is modified in place.
    column_name : str
        name of dataframe column whose elements are needed to be transformed
    custom_function
        Callable invoked as custom_function(element, *args, **kwargs) for each element of the column.
    new_column_name : str
        When given (and non-empty), results are written to a column of this name and the original column is left
        untouched; otherwise the original column is overwritten with the results.
    *args, **kwargs
        Extra arguments forwarded to custom_function after the element.

    Returns
    -------
    pd.DataFrame
        The same dataframe object that was passed in, now carrying the transformed values.

    """
    def transform(element):
        return custom_function(element, *args, **kwargs)

    target_column = new_column_name if new_column_name else column_name
    dataframe[target_column] = dataframe[column_name].apply(transform)
    return dataframe


def add_preprocess_column(dataframe_object: pd.DataFrame, column_name: str = "title"):
    """Add a "cleaned_<column_name>" column produced by the preprocess method of the string_manipulation module.

    Parameters
    ----------
    dataframe_object : pandas.DataFrame object
        This is the dataframe containing the column which needs to be preprocessed.
    column_name : str
        This is the name of the column of dataframe to preprocess.

    Returns
    -------
    pandas.DataFrame object
        DataFrame with an additional column named "cleaned_" + column_name holding the preprocessed values.

    """
    return apply_custom_function_on_dataframe_column(
        dataframe_object,
        column_name,
        string_manipulation.text_manipulation_methods,
        new_column_name="cleaned_" + column_name,
        text_manipulation_method_name="preprocess_string_to_space_separated_words",
    )


def dataframe_column_counts(dataframe, column_name):
    """Count how often each unique element occurs in a dataframe column (wraps pandas value_counts()).

    Parameters
    ----------
    dataframe : pd.DataFrame
        dataframe which contains column that is to be counted
    column_name : str
        Name of the pandas column whose elements are counted.

    Returns
    -------
    object
        The result of value_counts() on the column: unique column elements with their counts.

    """
    selected_column = dataframe[column_name]
    return selected_column.value_counts()


def try_convert_dataframe_column_elements_to_list(dataframe: pd.DataFrame, column_name: str) -> List[list]:
    """Convert each element of a dataframe column to a list, skipping the elements that can not be converted.

    Parameters
    ----------
    dataframe : pd.DataFrame
        The dataframe with column to convert into list
    column_name : str
        Name of column for conversion

    Returns
    -------
    List[list]
        One list per convertible element, in column order. Elements for which list() raises TypeError are
        reported on stdout and left out of the result.

    """
    converted_elements = []

    for element in dataframe[column_name]:
        try:
            element_as_list = list(element)
        except TypeError:
            print(f"'{element}' can not be converted to list")
        else:
            converted_elements.append(element_as_list)
    return converted_elements


def dict_key_value_to_records(dictionary: dict, key_column_name: str, value_column_name: str):
    """Convert {'key': value, key1: value1} into records [{key_column_name: key, value_column_name: value}, ...],
    a format that can be converted to pd.DataFrame.

    Parameters
    ----------
    dictionary : dict
        hash map or dictionary that contains key and value pairs.
    key_column_name : str
        name of the records column that receives the dictionary keys
    value_column_name : str
        name of the records column that receives the dictionary values

    Returns
    -------
    list
        This list is in records format, one two-entry dictionary per item of the input dictionary.

    """
    return [
        {key_column_name: item_key, value_column_name: item_value}
        for item_key, item_value in dictionary.items()
    ]


def unpack_list_of_lists_with_optional_apply_custom_function(list_of_lists: List[list], custom_function=None) -> list:
    """Flatten a list of lists into one list, optionally passing every element through custom_function.
    example- [[1,2,3], [3,4,5]] to [1,2,3,3,4,5]

    Parameters
    ----------
    list_of_lists : List[list]
        This list contains lists as elements which might contain other elements.
    custom_function
        Optional callable applied to each element of the inner lists; skipped when it is None (or falsy).

    Returns
    -------
    list
        list containing all the elements of the inner lists in order, transformed by custom_function when given.

    """
    if custom_function:
        return [custom_function(item) for inner_list in list_of_lists for item in inner_list]
    return [item for inner_list in list_of_lists for item in inner_list]


def unpack_list_of_lists(list_of_lists):
    """Flatten one level of a list that mixes plain elements and lists.

    Parameters
    ----------
    list_of_lists : list
        this is list consisting of elements and lists. example ["first_element", ["second_element"]]

    Returns
    -------
    list
        This is the resultant list consisting of only elements. example ["first_element", "second_element"].
        Only members that are exactly of type list are expanded; everything else is kept as a single element.

    """
    flattened = []
    for member in list_of_lists:
        # Wrap non-list members so both cases can be handled by a single extend().
        flattened.extend(member if type(member) is list else [member])
    return flattened


def records_list_to_dataframe(list_of_dicts: List[Dict[str, Any]]) -> pd.DataFrame:
    """Build a pandas dataframe from a list of dictionaries (records).

    Parameters
    ----------
    list_of_dicts : List[Dict[str, Any]]
        This list contains the dictionaries inside as elements. Example - [{'primary_title' : "this is the title"}]

    Returns
    -------
    pd.DataFrame
        This is the pandas dataframe built with pandas.DataFrame.from_records, one row per dictionary.

    """
    return pd.DataFrame.from_records(list_of_dicts)


def ris_file_to_records_list(ris_file_path: str) -> List[Dict[str, Any]]:
    """Load a .ris file into a list of citation dictionaries using rispy (https://pypi.org/project/rispy/).
    For more info on ris format, visit: https://en.wikipedia.org/wiki/RIS_(file_format)

    Parameters
    ----------
    ris_file_path : str
        This is the filepath of the ris file.

    Returns
    -------
    List[Dict[str, Any]]
        The citations loaded by rispy in records format, same as in pandas. Every citation additionally gets a
        "source" key holding the value of os_utils.get_filename_from_path(ris_file_path).

    """
    with open(ris_file_path, 'r') as ris_handle:
        citation_records = rispy.load(ris_handle)
        file_label = os_utils.get_filename_from_path(ris_file_path)
        # Tag each citation with the file it was read from.
        for citation in citation_records:
            citation["source"] = file_label

    return citation_records


def ris_file_to_pandas_dataframe(ris_file_path: str) -> pd.DataFrame:
    """
    Read a ris file with 'rispy' into a list of dicts and convert that list to a pandas.DataFrame.

    Parameters
    ----------
    ris_file_path : str
        This is the path of ris citations file

    Returns
    -------
    pd.DataFrame
        dataframe object from pandas, built from the citations rispy loaded.

    """
    with open(ris_file_path, 'r') as ris_handle:
        return records_list_to_dataframe(rispy.load(ris_handle))


def load_multiple_ris_citations_files(citations_files_parent_folder_path: str) -> List[dict]:
    """Load every ris citations file found under a folder into one list.

    Parameters
    ----------
    citations_files_parent_folder_path : str
        this is the path of parent folder of where citations files exists.

    Returns
    -------
    List[dict]
        this is list of citations dicts inclusive of all citation files whose path ends with ".ris".

    """
    candidate_paths = os_utils.extract_files_path_from_directories_or_subdirectories(
        citations_files_parent_folder_path)
    merged_citations = []
    for candidate_path in candidate_paths:
        # Only ris files are loaded; every other path is ignored.
        if not candidate_path.endswith(".ris"):
            continue
        merged_citations += ris_file_to_records_list(candidate_path)
    return merged_citations


def load_multiple_ris_citations_files_to_dataframe(citations_files_parent_folder_path: str) -> pd.DataFrame:
    """Load every ris citations file found under a folder into a single dataframe.

    Parameters
    ----------
    citations_files_parent_folder_path : str
        this is the path of parent folder of where citations files exists.

    Returns
    -------
    pd.DataFrame
        this is dataframe of citations dicts inclusive of all citation files.

    """
    return records_list_to_dataframe(load_multiple_ris_citations_files(citations_files_parent_folder_path))


def list_to_text_file(filename: str, list_name: list, permission: str = "w") -> None:
    """Write a list to a text file, putting the str() of each element on its own line.

    Parameters
    ----------
    filename : str
        This is the name to be given for text file.
    list_name : list
        This is the python data structure list which contains some data.
    permission : str
        The mode string passed to open() for the file, for example "w" to overwrite or "a" to append.

    Returns
    -------
    None

    """
    with open(filename, permission) as file:
        # One batched call instead of two write() calls per element.
        file.writelines(str(item) + "\n" for item in list_name)

def list_to_string(list_name):
    """Convert a list to one text string, putting the str() of each element on its own line.

    Parameters
    ----------
    list_name : list
        This is the python data structure list which contains some data.

    Returns
    -------
    str
        This is the text string comprised of all data of the list; every element is followed by a newline, so an
        empty list gives an empty string.

    """
    # join() builds the result in one pass instead of repeatedly growing a string with +=.
    return "".join(str(item) + "\n" for item in list_name)


def dict_values_data_type(dictionary):
    """Group the keys of a dictionary by the data type of their values.

    Parameters
    ----------
    dictionary : dict
        This is the dictionary which contains different types of object in values. Example - {"first": [2, 5], "sec": 3}

    Returns
    -------
    dict
        A defaultdict(list) mapping str(type(value)) to the keys holding a value of that type. For the example
        above this will output {"<class 'list'>": ["first"], "<class 'int'>": ["sec"]}

    """
    keys_by_type = defaultdict(list)
    for key in dictionary:
        type_label = str(type(dictionary[key]))
        keys_by_type[type_label].append(key)
    return keys_by_type


def text_file_to_list(file_path: str, permission: str = "r"):
    """Read a text file into a list with one element per line. get first line of text file by list[0].

    Parameters
    ----------
    file_path : str
        This is the path or name of the text file to read.
    permission : str
        The mode string passed to open() for the file.

    Returns
    -------
    list
        The file content split on "\\n": [first line, second line,.... ]. A file ending with a newline therefore
        yields an empty string as last element.

    """
    with open(file_path, permission) as text_file:
        return text_file.read().split("\n")


def load_text_file(file_path: str, permission: str = "r"):
    """Read a text file and return its whole content. for more info visit-
    https://docs.python.org/3/tutorial/inputoutput.html

    Parameters
    ----------
    file_path : str
        This is the path or name of text file.
    permission : str
        The mode string passed to open() for the file.

    Returns
    -------
    str
        Everything read() returns for the file, i.e. all lines loaded at once (bytes when a binary mode is used).

    """
    with open(file_path, permission) as text_file:
        return text_file.read()


def remove_empty_lines(input_file_path: str, output_filename: str = "output_file.ris") -> None:
    """
    Copy the input file to the output file while leaving out the blank lines.

    Parameters
    ----------
    input_file_path : str
        this is the path of input file
    output_filename : str
        this is the name of the output ris file with extension.

    Returns
    -------
    None

    """
    # Context managers close both files even when reading or writing raises.
    # NOTE(review): the output is opened in append mode, as before, so calling this twice with the same
    # output_filename appends the content again - confirm whether "w" was intended.
    with open(input_file_path, "r") as input_file, open(output_filename, "a") as output_file:
        # Only lines that consist of nothing but the newline are dropped.
        output_file.writelines(line for line in input_file if line != "\n")


def write_json_file_with_dict(output_file_path: str, input_dict: dict) -> None:
    """Serialise input_dict as json and write it to output_file_path.

    Parameters
    ----------
    output_file_path : str
        This is the path of the output file; it is opened in "w" mode, so an existing file is overwritten.
    input_dict : dict
        This is the python dictionary which we want to be saved in json file format.

    Returns
    -------
    None
        Function doesn't return anything but write a json file at output_file_path.

    """
    with open(output_file_path, "w") as json_handle:
        json.dump(input_dict, fp=json_handle)


def json_file_to_dict(json_file_path: str) -> dict:
    """Read the json file at the given path and return its data as python object.

    Parameters
    ----------
    json_file_path : str
        This is the json file path which is needed to be converted.

    Returns
    -------
    dict
        This is the data decoded from the json file (a dict when the file holds a json object).

    """
    with open(json_file_path, 'r') as json_handle:
        raw_text = json_handle.read()
    return json.loads(raw_text)


def get_text_from_pdf_pdftotext(pdf_file_path: str, pages: str = "all") -> str:
    """Extract the text from pdf file via pdftotext. for more info, visit: https://pypi.org/project/pdftotext/

    Parameters
    ----------
    pdf_file_path : str
        This is the path of the pdf file.
    pages : str
        This could be 'all' to get full text of pdf and 'first' for first page of pdf. Any other value is used
        as-is to index the pdf object.

    Returns
    -------
    str
        This is the required text from pdf file. An empty string is returned for 'first' when the pdf object is
        empty, which includes the empty string get_pdf_object_from_pdf_path gives when pdftotext is not installed.

    """
    pdf_object = get_pdf_object_from_pdf_path(pdf_file_path)
    if pages == "first":
        # Guard the index so an empty object gives "" (like the 'all' branch) instead of raising IndexError.
        return pdf_object[0] if len(pdf_object) > 0 else ""
    if pages == "all":
        # Join the per-page texts in one pass; the loop no longer rebinds the `pages` argument.
        return "".join(pdf_object)
    return pdf_object[pages]


def get_pdf_object_from_pdf_path(pdf_file_path: str):
    """Load a pdf file with pdftotext into a pdf object where loop and indexing can show text per pages.

    Parameters
    ----------
    pdf_file_path : str
        This is the path of pdf file.

    Returns
    -------
    pdftotext.PDF or str
        This is pdf object with Extracted text. When the pdftotext library can not be imported, installation
        instructions are printed and an empty string is returned instead.

    """
    try:
        import pdftotext
    except ImportError:
        print("""This function requires pdftotext library to read pdfs.

        step 1. install OS Dependencies:
        These instructions assume you're using Python 3 on a recent OS.
        - Debian, Ubuntu, and friends
        sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev
        - Fedora, Red Hat, and friends
        sudo yum install gcc-c++ pkgconfig poppler-cpp-devel python3-devel
        - macOS
        brew install pkg-config poppler python
        - Windows (Install poppler through conda)
        conda install -c conda-forge poppler

        step 2. Install pdftotext
        pip install pdftotext

        for more info, please visit https://pypi.org/project/pdftotext/""")
        return ""

    with open(pdf_file_path, "rb") as binary_pdf:
        return pdftotext.PDF(binary_pdf)


def get_text_from_pdf_pymupdf(pdf_file_path: str, pages: str = 'all') -> str:
    """Extract the text from pdf file via fitz(PyMuPDF). for more info, visit: https://pypi.org/project/PyMuPDF/

    Parameters
    ----------
    pdf_file_path : str
        This is the path of pdf file.
    pages : str
        This could be 'all' to get full text of pdf and 'first' for first page of pdf. Any other value gives an
        empty string.

    Returns
    -------
    str
        This is the required text from pdf file. When fitz can not be imported, installation instructions are
        printed and an empty string is returned.

    """
    try:
        import fitz
    except ImportError:
        print("""This function requires pymupdf library to read pdfs.

        Install pymupdf using: 
        python -m pip install --upgrade pip
        python -m pip install --upgrade pymupdf

        for more info, please visit https://pypi.org/project/PyMuPDF/""")
        return ""

    with fitz.open(pdf_file_path) as document:
        if pages == "first":
            # Return from inside the loop so only the first page is read; a document without pages gives "".
            for page in document:
                return "" + page.get_text()
            return ""
        if pages == "all":
            return "".join(page.get_text() for page in document)
    return ""


def get_text_from_pdf(pdf_file_path: str, pages: str = 'all', pdf_reader: str = 'pdftotext') -> Union[str, bool]:
    """Get the text from a pdf file using either pdftotext or pymupdf.

    Parameters
    ----------
    pdf_file_path : str
        This is the path of pdf file.
    pages : str
        This could be 'all' to get full text of pdf and 'first' for first page of pdf.
    pdf_reader : str
        This is python pdf reader package which convert pdf to text, either 'pdftotext' or 'pymupdf'.

    Returns
    -------
    str
        This is the required text from pdf file. An empty string is returned when the reader raises any exception
        and also when pdf_reader names an unsupported reader (after printing "Not Implemented").

    """
    if pdf_reader not in ('pdftotext', 'pymupdf'):
        print("Not Implemented")
        # Return the same "no text" value as the failure path instead of falling off the end with None.
        return ""
    try:
        if pdf_reader == 'pdftotext':
            return get_text_from_pdf_pdftotext(pdf_file_path, pages)
        return get_text_from_pdf_pymupdf(pdf_file_path, pages)
    except Exception:
        # Deliberate best effort: any reader failure is reported to the caller as "no text".
        return ""


def get_text_from_multiple_pdf_reader(pdf_file_path: str, pages: str = 'all') -> Union[str, bool]:
    """Get the text from a pdf file using pdftotext; when that fails or gives no text, use pymupdf instead.

    Parameters
    ----------
    pdf_file_path : str
        This is the path of pdf file.
    pages : str
        This could be 'all' to get full text of pdf and 'first' for first page of pdf.

    Returns
    -------
    str
        This is the required text from pdf file, or an empty string when neither reader produced text.

    """
    try:
        extracted_text = get_text_from_pdf_pdftotext(pdf_file_path, pages)
    except Exception:
        extracted_text = ""

    # The first reader produced something, no fallback needed.
    if not extracted_text == "":
        return extracted_text

    try:
        return get_text_from_pdf_pymupdf(pdf_file_path, pages)
    except Exception:
        return extracted_text


class ASReview:
    """Export citation data, given as dataframe or records list, to the csv file format used to start an
    ASReview project.
    """

    def __init__(self, data: Union[List[dict], pd.DataFrame]):
        """Store the citation data to export.

        Parameters
        ----------
        data :
            data could be List[dict] and pd.DataFrame

        """
        self.data = data

    def get_file(self, output_filename: str = "output.csv", index: bool = True):
        """Outputs the file needed to start project in ASReview.

        The data is written as csv with an additional, empty 'label_included' column. The stored data itself is
        not modified.

        Parameters
        ----------
        output_filename : str
            name or path of your needed file.
        index : bool
            asks if you need index column in output file.

        Returns
        -------
        None

        Raises
        ------
        NotImplementedError
            When the stored data is neither a pd.DataFrame nor a list.

        """
        # isinstance (rather than an exact type comparison) also accepts subclasses of DataFrame and list.
        if isinstance(self.data, pd.DataFrame):
            # Work on a copy so the caller's dataframe does not get the extra column.
            dataframe = self.data.copy()
        elif isinstance(self.data, list):
            # The conversion already builds a new dataframe, so no further copy is needed.
            dataframe = records_list_to_dataframe(self.data)
        else:
            raise NotImplementedError(f"data type {type(self.data)} not Implemented, Use List[dict] and pd.DataFrame.")

        dataframe['label_included'] = ""
        dataframe_to_csv_file(dataframe, output_filename, index)


class Reader:
    """Contains functionality to read files.

    """

    def __init__(self, file_path: str):
        """Needs file path to read a file.

        Parameters
        ----------
        file_path : str
            path of the file.
        """
        self.file_path = file_path
        self.file_extension = os_utils.get_file_extension_from_path(self.file_path)

    def get_text(self, pages: str = 'all'):
        """Pick a reader based on the file extension and output the content of the file.

        Parameters
        ----------
        pages : str
            contain option to read 'first' or 'all' pages. Only used for pdf files.

        Returns
        -------
        object
            Text for pdf files and for files read as plain text, the pandas reader result for csv and excel
            files, and the decoded json data for json files.

        """
        if self.file_extension == "pdf":
            return get_text_from_multiple_pdf_reader(self.file_path, pages)
        if self.file_extension == "csv":
            return self.pandas_reader("read_csv")
        # startswith() instead of indexing [0]: an empty extension now falls through to the plain text reader
        # instead of raising IndexError.
        # NOTE(review): as before, every extension starting with "x" (not only xls/xlsx) is sent to read_excel.
        if self.file_extension.startswith("x"):
            return self.pandas_reader("read_excel")
        if self.file_extension == "json":
            return json_file_to_dict(self.file_path)
        return load_text_file(self.file_path)

    def pdf_pdftotext_reader(self, pages: str = 'all'):
        """Extract the text from pdf file via pdftotext. for more info, visit: https://pypi.org/project/pdftotext/

        Parameters
        ----------
        pages : str
            This could be 'all' to get full text of pdf and 'first' for first page of pdf.

        Returns
        -------
        str
            This is the required text from pdf file, or an empty string when the extraction raises.

        """
        try:
            pdf_text = get_text_from_pdf_pdftotext(self.file_path, pages)
        except Exception:
            # Best effort: a failing reader is reported as "no text".
            pdf_text = ""

        return pdf_text

    def pdf_pymupdf_reader(self, pages: str = 'all'):
        """Extract the text from pdf file via fitz(PyMuPDF). for more info, visit: https://pypi.org/project/PyMuPDF/

        Parameters
        ----------
        pages : str
            This could be 'all' to get full text of pdf and 'first' for first page of pdf.

        Returns
        -------
        str
            This is the required text from pdf file, or an empty string when the extraction raises.

        """
        try:
            pdf_text = get_text_from_pdf_pymupdf(self.file_path, pages)
        except Exception:
            # Best effort: a failing reader is reported as "no text".
            pdf_text = ""

        return pdf_text

    def pandas_reader(self, input_file_type):
        """Read file using pandas IO https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html

        Parameters
        ----------
        input_file_type : str
            name of the pandas IO function to call, check pandas IO for examples like read_csv, read_excel etc.

        Returns
        -------
        object
            Whatever the chosen pandas function returns for the file path.

        """
        # Look the reader up by name on the pandas module and call it with the stored path.
        return getattr(pd, input_file_type)(self.file_path)