"""Module: validation
This module contains functions for validating that the downloaded articles are the same as the ones we require. It also
contains functions to get an article's source name and to create lists of missed or duplicate articles.
"""
from difflib import SequenceMatcher
import pandas as pd
from typing import List, Union, Dict, Any
from systematic_review import string_manipulation
from systematic_review import converter
from systematic_review import os_utils
def get_dataframe_column_as_list(dataframe: pd.DataFrame, column_name: str = 'primary_title') -> list:
"""Get a pandas dataframe column's values as a list.
Parameters
----------
dataframe : pd.DataFrame
This is the dataframe which contains the column whose values we want as a list.
column_name : str
This is the name of the column.
Returns
-------
list
This is the list containing the values of one dataframe column.
"""
column_values_list = dataframe[column_name].to_list()
return column_values_list
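# A minimal usage sketch of the helper above; the column name and values here are hypothetical.
if __name__ == "__main__":
    _titles_df = pd.DataFrame({'primary_title': ["Article A", "Article B"]})
    print(get_dataframe_column_as_list(_titles_df, 'primary_title'))  # ['Article A', 'Article B']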
def similarity_sequence_matcher(string_a: str, string_b: str) -> float:
"""Shows the similarity between two strings as a ratio, e.g. 0.9836065573770492, which means 98.36%.
Parameters
----------
string_a : str
This is the first string.
string_b : str
This is the second string.
Returns
-------
float
This is the result of SequenceMatcher, e.g. 0.9836065573770492, which means 98.36%.
"""
return SequenceMatcher(None, string_a, string_b).ratio()
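# Illustrative sketch: SequenceMatcher returns 1.0 for identical strings and a lower ratio as they diverge.
# The titles below are made up.
if __name__ == "__main__":
    print(similarity_sequence_matcher("fruit fly optimization", "fruit fly optimization"))  # 1.0
    print(similarity_sequence_matcher("fruit fly optimization", "fruit fly optimisation"))  # roughly 0.95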
def calculate_percentage(value: float, total: float) -> float:
"""Calculate the percentage of value in total.
Parameters
----------
value : float
This is the input number, normally smaller than total.
total : float
This is the larger number of which we want to know the percentage.
Returns
-------
float
This is the calculated percentage, e.g. 98.36065573770492, which means 98.36%.
"""
percentage = (value / total) * 100
return percentage
def amount_by_percentage(number: float, percentage: float) -> float:
"""Get the amount equal to a percentage of a number. Example: 5% (percentage) of 10 (number) is 0.5 (result).
Parameters
----------
number : float
This is the input number of which we want some percentage.
percentage : float
This is the mathematical percentage, on a 0-100 scale.
Returns
-------
float
This is the resultant number.
"""
return number * percentage / 100
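# Quick numeric sketch of the two percentage helpers above; the values are arbitrary.
if __name__ == "__main__":
    print(calculate_percentage(60, 61))  # 98.36065573770492, i.e. roughly 98.36%
    print(amount_by_percentage(10, 5))   # 0.5, i.e. 5% of 10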
def add_dict_element_with_count(dictionary: dict, key: str) -> dict:
"""Increments the value for the given key in the dictionary, or initialises the key with value 1. Works like a
collections.defaultdict that starts new keys at 1.
Parameters
----------
dictionary : dict
This is the dictionary to which we want to add the element.
key : str
This is the key of the dictionary {key: value}.
Returns
-------
dict
This is the edited dict with the new element counts.
"""
if key in dictionary.keys():
dictionary[key] += 1
else:
dictionary[key] = 1
return dictionary
def dict_from_list_with_element_count(input_list: list) -> dict:
"""Put the input list elements into a dictionary with their counts.
Parameters
----------
input_list : list
This is the list of elements, possibly with duplicates.
Returns
-------
dict
This is a dictionary with the list elements as keys and each element's count as the value.
"""
output_dict = dict()
for key in input_list:
add_dict_element_with_count(output_dict, key)
return output_dict
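# Sketch of the counting helpers above; the words are placeholders.
if __name__ == "__main__":
    _counts = dict_from_list_with_element_count(["fruit", "fly", "fruit"])
    print(_counts)                                      # {'fruit': 2, 'fly': 1}
    print(add_dict_element_with_count(_counts, "fly"))  # {'fruit': 2, 'fly': 2}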
def validate_column_details_between_two_record_list(first_list_of_dict: list, second_list_of_dict: list,
first_column_name: str = "cleaned_title", second_column_name: str =
'cleaned_title_pdf') -> tuple:
"""It produce list of matched columns rows and unmatched column rows based on same column from first list of dict.
Note- emphasis on first list as function check all records of first list of dict in second list of dict.
title column of second_list_of_dict is kept by merging with first.
Parameters
----------
second_column_name : str
This is the name of the column which contains the pdf article title.
first_list_of_dict : list
This is a list of dicts (records) which contains first_column_name.
second_list_of_dict : list
This is a list of dicts (records) which contains second_column_name.
first_column_name : str
This is the name of the column which contains the citation title.
Returns
-------
tuple
matched_list - It contains the column rows which matched in both data objects.
unmatched_list - It contains the column rows which did not match in both data objects.
"""
matched_list = []
unmatched_list = []
for article_name in first_list_of_dict:
validation_bool, percentage_matched, method = True, 0, None
for article_count in second_list_of_dict:
validation_bool, percentage_matched, method = multiple_methods_validating_words_string_in_text(
article_name[first_column_name], article_count[second_column_name])
# print(f"validation_bool: {validation_bool}, percentage_matched: {percentage_matched},
# text_manipulation_method_name: {text_manipulation_method_name}")
if validation_bool:
article_name_count = {**article_name, **article_count}
matched_list.append(article_name_count)
break
if not validation_bool:
unmatched_list.append([article_name[first_column_name], percentage_matched, method])
print(f"matched_list count = {len(matched_list)}, unmatched_list count = {len(unmatched_list)}")
return matched_list, unmatched_list
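# Hedged sketch: citation records are matched against pdf-derived records on their title columns.
# The records below are made up, and the actual outcome depends on the preprocessing done by
# systematic_review.string_manipulation:
#   citations = [{"cleaned_title": "fruit fly optimization algorithm", "year": 2020}]
#   pdf_records = [{"cleaned_title_pdf": "a modified fruit fly optimization algorithm"}]
#   matched, unmatched = validate_column_details_between_two_record_list(citations, pdf_records)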
def deep_validate_column_details_between_two_record_list(first_list_of_dict: list, second_list_of_dict: list,
first_column_name: str = "cleaned_title",
second_column_name: str = 'cleaned_title_pdf') -> tuple:
"""It produce list of matched columns rows and unmatched column rows based on same column from both.
Parameters
----------
second_column_name : str
This is the name of the column which contains the pdf article title.
first_list_of_dict : list
This is a list of dicts (records) which contains first_column_name.
second_list_of_dict : list
This is a list of dicts (records) which contains second_column_name.
first_column_name : str
This is the name of the column which contains the citation title.
Returns
-------
tuple
matched_list - It contains the column rows which matched in both data objects.
unmatched_list - It contains the column rows which did not match in both data objects.
"""
import copy
temp_first_list_of_dict = copy.deepcopy(first_list_of_dict)
temp_second_list_of_dict = copy.deepcopy(second_list_of_dict)
matched_list = []
for first_dict in list(temp_first_list_of_dict):  # iterate over a snapshot; the original list is mutated below
if first_column_name in first_dict:
for second_dict in temp_second_list_of_dict:
if second_column_name in second_dict:
if first_dict[first_column_name] == second_dict[second_column_name]:
article_name_count = {**first_dict, **second_dict}
matched_list.append(article_name_count)
temp_first_list_of_dict.remove(first_dict)
temp_second_list_of_dict.remove(second_dict)
break
unmatched_list = temp_first_list_of_dict + temp_second_list_of_dict
return matched_list, unmatched_list
def compare_two_dict_members_via_percent_similarity(first_dict: dict, second_dict: dict) -> float:
"""Compares the elements of two dictionaries and returns the percentage similarity.
Parameters
----------
first_dict : dict
Example - first_dict = {'mixed':1, 'modified':1, 'fruit':1, 'fly':1, 'optimization':1}
second_dict : dict
Example - second_dict = {'mixed':1, 'modified':1, 'fruit':1, 'fly':1, 'optimization':1, 'algorithm': 1}
Returns
-------
float
This is a percentage represented as a decimal number, e.g. 98.36065573770492, which means 98.36%.
"""
similar_dict_keys_count = 0
total_dict_keys_count = 0
all_dict_keys = {**first_dict, **second_dict}
for key, value in all_dict_keys.items():
if key in first_dict and key in second_dict:
if first_dict[key] == second_dict[key]:
same_values_in_dict = (2 * first_dict[key])
similar_dict_keys_count += same_values_in_dict
total_dict_keys_count += same_values_in_dict
else:
diff = abs(first_dict[key] - second_dict[key])
same_values_in_dict = first_dict[key] if first_dict[key] < second_dict[key] else second_dict[key]
similar_dict_keys_count += same_values_in_dict
total_dict_keys_count += (diff + same_values_in_dict)
else:
total_dict_keys_count += all_dict_keys[key]
percent_similarity = calculate_percentage(similar_dict_keys_count, total_dict_keys_count)
return percent_similarity
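# Worked sketch of the dictionary similarity above: 4 of the 5 total key counts are shared, giving 80.0.
# The word counts are placeholders.
if __name__ == "__main__":
    _first = {'fruit': 1, 'fly': 1}
    _second = {'fruit': 1, 'fly': 1, 'algorithm': 1}
    print(compare_two_dict_members_via_percent_similarity(_first, _second))  # 80.0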
def compare_two_list_members_via_percent_similarity(words_list: list, boolean_membership_list: list) -> float:
"""Compares the elements of two lists and returns the percentage similarity.
Parameters
----------
words_list : list
This contains the elements whose similarity is to be checked.
boolean_membership_list : list
This list contains True and False values marking which words were found.
Returns
-------
float
This is a percentage represented as a decimal number, e.g. 98.36065573770492, which means 98.36%.
"""
words_found_in_boolean_membership_list = 0
length_of_words_list = len(words_list)
for word_indicator in boolean_membership_list:
if word_indicator:
words_found_in_boolean_membership_list += 1
percent_similarity = calculate_percentage(words_found_in_boolean_membership_list, length_of_words_list)
return percent_similarity
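# Sketch: two of the four words are flagged True in the membership list, giving 50.0.
if __name__ == "__main__":
    print(compare_two_list_members_via_percent_similarity(
        ['mixed', 'fruit', 'fly', 'optimization'], [True, True, False, False]))  # 50.0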
def exact_words_checker_in_text(words_string: str, text_string: str) -> bool:
"""This checks for an exact, in-order match of the substring's words in the string and returns True or False.
Parameters
----------
words_string : str
These are the words we are searching for.
text_string : str
This is the query string or lengthy text.
Returns
-------
bool
This returns True if exact words_string found in text_string else False.
"""
if not isinstance(words_string, str) or not isinstance(text_string, str):
raise TypeError("words_string and text_string must be str")
words_list = string_manipulation.split_preprocess_string(words_string)
words_list_length = len(words_list)
words_list_end_element_index = words_list_length - 1
words_set = set(words_list)
# words_dict_membership = dict_from_list_with_element_count(words_list)
text_list = string_manipulation.split_preprocess_string(text_string)
validation_bool = False
searching_flag = False
for word_of_text in text_list:
if searching_flag:
if word_of_text == words_list[searching_index]:
if searching_index == words_list_end_element_index:
validation_bool = True
return validation_bool
searching_index += 1
else:
searching_flag = False
if word_of_text in words_set:
if word_of_text == words_list[0]:
# starting_index = words_list.index(word_of_text)
searching_flag = True
searching_index = 1
return validation_bool
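# Hedged sketch: both strings go through the same string_manipulation.split_preprocess_string call,
# so an in-order occurrence of the words is expected to validate. The strings are hypothetical.
if __name__ == "__main__":
    print(exact_words_checker_in_text(
        "fruit fly optimization",
        "a modified fruit fly optimization algorithm for big data"))  # expected True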
def words_percentage_checker_in_text(words_string: str, text_string: str, validation_limit: float = 70) -> tuple:
"""This checks for a match of the substring's words in the string and returns True or False based on success. It also
returns the matched word percentage.
Limitation: this doesn't work properly if words_string has duplicate words.
Parameters
----------
words_string : str
These are the words we are searching for.
text_string : str
This is the query string or lengthy text.
validation_limit : float
This is the percentage limit on the similarity of the checked substring. Example: 50 returns True if half of the words are found.
Returns
-------
tuple
This returns True if words_string is found in text_string, else False.
This also returns the matched substring percentage.
"""
words_list = string_manipulation.split_preprocess_string(words_string)
words_list_length = len(words_list)
# words_list_end_element_index = words_list_length - 1
words_set = set(words_list)
# words_dict_membership = dict_from_list_with_element_count(words_list)
text_list = string_manipulation.split_preprocess_string(text_string)
temp_list = [False] * words_list_length
validation_bool = False
# searching_flag = False
word_list_element_index = -1
percentage_matched = 0
for word_of_text in text_list:
if word_of_text in words_set:
word_of_text_index_in_words_list = words_list.index(word_of_text)
if word_of_text_index_in_words_list > word_list_element_index:
word_list_element_index = word_of_text_index_in_words_list
temp_list[word_of_text_index_in_words_list] = True
percentage_matched = compare_two_list_members_via_percent_similarity(words_list, temp_list)
validation_bool = True if percentage_matched > validation_limit else False
if validation_bool:
return validation_bool, percentage_matched
else:
temp_list = [False] * words_list_length
temp_list[word_of_text_index_in_words_list] = True
else:
temp_list = [False] * words_list_length
word_list_element_index = -1
return validation_bool, percentage_matched
def jumbled_words_percentage_checker_in_text(words_string: str, text_string: str, validation_limit: float = 70,
wrong_word_limit: int = 2) -> tuple:
"""start calculating percentage if half of words are found in sequence. This also takes in consideration of words
which got jumbled up due to pdf reading operation.
Parameters
----------
words_string : str
These are the words we are searching for.
text_string : str
This is the query string or lengthy text.
validation_limit : float
This is the percentage limit on the similarity of the checked substring. Example: 50 returns True if half of the words are found.
wrong_word_limit : int
This is the limit up to which the algorithm ignores wrong words in the sequence.
Returns
-------
tuple
This returns True if words_string is found in text_string, else False.
This also returns the matched substring percentage.
"""
words_list = string_manipulation.split_preprocess_string(words_string)
# words_list_length = len(words_list)
# words_list_end_element_index = words_list_length - 1
words_set = set(words_list)
words_dict_membership = dict_from_list_with_element_count(words_list)
text_list = string_manipulation.split_preprocess_string(text_string)
validation_bool = False
percentage_matched = 0
skipped_words = 0
temp_dict = dict()
for word_of_text in text_list:
if word_of_text in words_set:
skipped_words = 0
add_dict_element_with_count(temp_dict, word_of_text)
else:
skipped_words += 1
if skipped_words >= wrong_word_limit:
temp_dict = dict()
continue
percentage_matched = compare_two_dict_members_via_percent_similarity(words_dict_membership, temp_dict)
validation_bool = True if percentage_matched > validation_limit else False
if validation_bool:
return validation_bool, percentage_matched
temp_dict = dict()
return validation_bool, percentage_matched
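# Hedged sketch of the two percentage-based checkers above: with the default limit of 70, a title whose
# words mostly appear (even slightly out of order for the jumbled variant) should validate. The inputs
# are hypothetical and the exact percentages depend on the preprocessing.
if __name__ == "__main__":
    print(words_percentage_checker_in_text(
        "modified fruit fly optimization algorithm",
        "a modified fruit fly optimization algorithm for scheduling"))
    print(jumbled_words_percentage_checker_in_text(
        "modified fruit fly optimization algorithm",
        "a modified fruit optimization fly algorithm for scheduling"))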
def validating_pdf_via_filename(pdf_file_path: str, pages: str = "first", method: str = "exact_words") -> bool:
"""This function takes the name of the file and searches for that name in the text of the pdf file. If the search
succeeds, the pdf is validated as downloaded, else not downloaded. Example - pdf file name -> check in -> text of
pdf file.
Parameters
----------
pdf_file_path : str
This is the path of the pdf file.
pages : str
This could be 'all' to get the full text of the pdf, or 'first' for the first page of the pdf.
method : str
This is the switch option to select the method from exact_words, words_percentage, jumbled_words_percentage.
Returns
-------
bool
True or False value, where True depicts a validated article.
"""
text = converter.get_text_from_pdf(pdf_file_path, pages)
# print(text)
pdf_filename = os_utils.get_filename_from_path(pdf_file_path)
pdf_filename = string_manipulation.strip_string_from_right_side(pdf_filename)
if method == "exact_words":
validation_bool = exact_words_checker_in_text(pdf_filename, text)
elif method == "words_percentage":
validation_bool, percentage_matched = words_percentage_checker_in_text(pdf_filename, text)
elif method == "jumbled_words_percentage":
validation_bool, percentage_matched = jumbled_words_percentage_checker_in_text(pdf_filename, text)
else:
validation_bool = False
print("Please provide a valid method name; the given method name is not available")
return validation_bool
def multiple_methods_validating_pdf_via_filename(pdf_file_path: str, pages: str = "first",
pdf_reader: str = 'pdftotext') -> tuple:
"""This function checks name of file and find the name in the text of pdf file. if it become successful then pdf is
validated as downloaded else not downloaded. Example - pdf file name -> check in -> text of pdf file. pdf_reader
options are pdftotext or pymupdf.
Parameters
----------
pdf_reader : str
This is the python pdf reader package which converts pdf to text.
pdf_file_path : str
This is the path of the pdf file.
pages : str
This could be 'all' to get full text of pdf and 'first' for first page of pdf.
Returns
-------
tuple
True or False value, where True depicts a validated article.
This also shows the percentage matched.
Last, it shows the method used, like exact_words, words_percentage, jumbled_words_percentage, or 'all' if every
method was executed to validate.
"""
# percentage_matched = 0
text = converter.get_text_from_pdf(pdf_file_path, pages, pdf_reader)
# print(text)
pdf_filename = os_utils.get_filename_from_path(pdf_file_path)
return multiple_methods_validating_words_string_in_text(pdf_filename, text)
def validating_multiple_pdfs_via_filenames(list_of_pdf_files_path: list, pages: str = "first",
pdf_reader: str = 'pdftotext') -> tuple:
"""This function checks pdf files in list_of_pdf_files_path and validate them with function named
'validating_pdf_via_filename'. Example - multiple pdf file name -> check in -> text of pdf file.
pdf_reader options are pdftotext or pymupdf.
Parameters
----------
pages : str
This could be 'all' to get full text of pdf and 'first' for first page of pdf.
pdf_reader : str
This is the python pdf reader package which converts pdf to text.
list_of_pdf_files_path : list
This is the list of paths of the pdf files.
Returns
-------
tuple
validated_pdf_list - contains the pdf files whose filename is found in the pdf text.
invalidated_pdf_list - list of files which couldn't be included in validated_pdf_list.
manual_pdf_list - list of files which can't be opened using the python pdf readers or which raise errors on opening.
"""
validated_pdf_list = []
invalidated_pdf_list = []
manual_pdf_list = []
try:
import pdftotext
except ImportError:
print("""This function requires pdftotext library to read pdfs.
step 1. install OS Dependencies:
These instructions assume you're using Python 3 on a recent OS.
- Debian, Ubuntu, and friends
sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev
- Fedora, Red Hat, and friends
sudo yum install gcc-c++ pkgconfig poppler-cpp-devel python3-devel
- macOS
brew install pkg-config poppler python
- Windows (Install poppler through conda)
conda install -c conda-forge poppler
step 2. Install pdftotext
pip install pdftotext
for more info, please visit https://pypi.org/project/pdftotext/""")
for article_name_path in list_of_pdf_files_path:
try:
value, percentage_matched, methods = multiple_methods_validating_pdf_via_filename(article_name_path,
pages, pdf_reader)
if value:
# print("validated")
validated_pdf_list.append([article_name_path, percentage_matched, methods])
elif not value:
# print("invalidated")
invalidated_pdf_list.append([article_name_path, percentage_matched, methods])
except Exception:
manual_pdf_list.append([article_name_path, 0, None])
return validated_pdf_list, invalidated_pdf_list, manual_pdf_list
class ValidateWordsInText:
"""This checks for the given words in the given text.
"""
def __init__(self, words_string: str, text_string: str,
words_percentage_checker_in_text_validation_limit: float = 70,
jumbled_words_percentage_checker_in_text_validation_limit: float = 70,
jumbled_words_percentage_checker_in_text_wrong_word_limit: int = 2):
"""
Parameters
----------
words_string : str
These are the words we are searching for.
text_string : str
This is the query string or lengthy text.
words_percentage_checker_in_text_validation_limit : float
This is the percentage limit on the similarity of the checked substring. Example: 50 returns True if half of
the words are found.
jumbled_words_percentage_checker_in_text_validation_limit : float
This is the percentage limit on the similarity of the checked substring for the jumbled words check.
jumbled_words_percentage_checker_in_text_wrong_word_limit : int
This is the limit up to which the algorithm ignores wrong words in the sequence.
"""
self.words_percentage_checker_in_text_validation_limit = words_percentage_checker_in_text_validation_limit
self.jumbled_words_percentage_checker_in_text_validation_limit = \
jumbled_words_percentage_checker_in_text_validation_limit
self.jumbled_words_percentage_checker_in_text_wrong_word_limit = \
jumbled_words_percentage_checker_in_text_wrong_word_limit
self.text_string = text_string
self.words_string = words_string
def exact_words_checker_in_text(self) -> bool:
"""This checks for an exact, in-order match of the substring's words in the string and returns True or False.
Returns
-------
bool
This returns True if exact words_string found in text_string else False.
"""
if not isinstance(self.words_string, str) or not isinstance(self.text_string, str):
raise TypeError("words_string and text_string must be str")
words_list = string_manipulation.split_preprocess_string(self.words_string)
words_list_length = len(words_list)
words_list_end_element_index = words_list_length - 1
words_set = set(words_list)
# words_dict_membership = dict_from_list_with_element_count(words_list)
text_list = string_manipulation.split_preprocess_string(self.text_string)
validation_bool = False
searching_flag = False
for word_of_text in text_list:
if searching_flag:
if word_of_text == words_list[searching_index]:
if searching_index == words_list_end_element_index:
validation_bool = True
return validation_bool
searching_index += 1
else:
searching_flag = False
if word_of_text in words_set:
if word_of_text == words_list[0]:
# starting_index = words_list.index(word_of_text)
searching_flag = True
searching_index = 1
return validation_bool
def words_percentage_checker_in_text(self) -> tuple:
"""This checks for a match of the substring's words in the string and returns True or False based on success. It
also returns the matched word percentage.
Limitation: this doesn't work properly if words_string has duplicate words.
Returns
-------
tuple
This returns True if words_string is found in text_string, else False.
This also returns the matched substring percentage.
"""
words_list = string_manipulation.split_preprocess_string(self.words_string)
words_list_length = len(words_list)
# words_list_end_element_index = words_list_length - 1
words_set = set(words_list)
# words_dict_membership = dict_from_list_with_element_count(words_list)
text_list = string_manipulation.split_preprocess_string(self.text_string)
temp_list = [False] * words_list_length
validation_bool = False
# searching_flag = False
word_list_element_index = -1
percentage_matched = 0
for word_of_text in text_list:
if word_of_text in words_set:
word_of_text_index_in_words_list = words_list.index(word_of_text)
if word_of_text_index_in_words_list > word_list_element_index:
word_list_element_index = word_of_text_index_in_words_list
temp_list[word_of_text_index_in_words_list] = True
percentage_matched = compare_two_list_members_via_percent_similarity(words_list, temp_list)
validation_bool = True if \
percentage_matched > self.words_percentage_checker_in_text_validation_limit else False
if validation_bool:
return validation_bool, percentage_matched
else:
temp_list = [False] * words_list_length
temp_list[word_of_text_index_in_words_list] = True
else:
temp_list = [False] * words_list_length
word_list_element_index = -1
return validation_bool, percentage_matched
def jumbled_words_percentage_checker_in_text(self) -> tuple:
"""Start calculating the percentage if half of the words are found in sequence. This also takes into consideration
words which got jumbled up by the pdf reading operation.
Returns
-------
tuple
This returns True if words_string is found in text_string, else False.
This also returns the matched substring percentage.
"""
words_list = string_manipulation.split_preprocess_string(self.words_string)
# words_list_length = len(words_list)
# words_list_end_element_index = words_list_length - 1
words_set = set(words_list)
words_dict_membership = dict_from_list_with_element_count(words_list)
text_list = string_manipulation.split_preprocess_string(self.text_string)
validation_bool = False
percentage_matched = 0
skipped_words = 0
temp_dict = dict()
for word_of_text in text_list:
if word_of_text in words_set:
skipped_words = 0
add_dict_element_with_count(temp_dict, word_of_text)
else:
skipped_words += 1
if skipped_words >= self.jumbled_words_percentage_checker_in_text_wrong_word_limit:
temp_dict = dict()
continue
percentage_matched = compare_two_dict_members_via_percent_similarity(words_dict_membership, temp_dict)
validation_bool = True if \
percentage_matched > self.jumbled_words_percentage_checker_in_text_validation_limit else False
if validation_bool:
return validation_bool, percentage_matched
temp_dict = dict()
return validation_bool, percentage_matched
def multiple_methods(self) -> tuple:
"""This method uses different checks to validate the article_name (substring) in the text.
Example - exact_words, words_percentage, jumbled_words_percentage.
Returns
-------
tuple
True or False value, where True depicts a validated article.
This also shows the percentage matched.
Last, it shows the method used, like exact_words, words_percentage, jumbled_words_percentage, or 'all' if every
method was executed to validate.
"""
# percentage_matched = 0
validation_bool = exact_words_checker_in_text(self.words_string, self.text_string)
if validation_bool:
return validation_bool, 100, "exact_words"
validation_bool, percentage_matched = words_percentage_checker_in_text(
self.words_string, self.text_string, self.words_percentage_checker_in_text_validation_limit)
if validation_bool:
return validation_bool, percentage_matched, "words_percentage"
validation_bool, percentage_matched = jumbled_words_percentage_checker_in_text(
self.words_string, self.text_string, self.jumbled_words_percentage_checker_in_text_validation_limit,
self.jumbled_words_percentage_checker_in_text_wrong_word_limit)
if validation_bool:
return validation_bool, percentage_matched, "jumbled_words_percentage"
return validation_bool, percentage_matched, "all"
def multiple_methods_validating_words_string_in_text(
article_name: str, text: str,
words_percentage_checker_in_text_validation_limit: float = 70,
jumbled_words_percentage_checker_in_text_validation_limit: float = 70,
jumbled_words_percentage_checker_in_text_wrong_word_limit: int = 2) -> tuple:
"""This text_manipulation_method_name uses different methods to validate the article_name(substring) in text. Example - exact_words,
words_percentage, jumbled_words_percentage.
Parameters
----------
article_name : str
This is the input string which we want to validate in the text.
text : str
This is the query string or lengthy text.
words_percentage_checker_in_text_validation_limit : float
This is the percentage limit on the similarity of the checked substring. Example: 50 returns True if half of the words are found.
jumbled_words_percentage_checker_in_text_validation_limit : float
This is the percentage limit on the similarity of the checked substring for the jumbled words check.
jumbled_words_percentage_checker_in_text_wrong_word_limit : int
This is the limit up to which the algorithm ignores wrong words in the sequence.
Returns
-------
tuple
True or False value, where True depicts a validated article.
This also shows the percentage matched.
Last, it shows the method used, like exact_words, words_percentage, jumbled_words_percentage, or 'all' if every
method was executed to validate.
"""
# percentage_matched = 0
validation_bool = exact_words_checker_in_text(article_name, text)
if validation_bool:
return validation_bool, 100, "exact_words"
validation_bool, percentage_matched = words_percentage_checker_in_text(
article_name, text, words_percentage_checker_in_text_validation_limit)
if validation_bool:
return validation_bool, percentage_matched, "words_percentage"
validation_bool, percentage_matched = jumbled_words_percentage_checker_in_text(
article_name, text, jumbled_words_percentage_checker_in_text_validation_limit,
jumbled_words_percentage_checker_in_text_wrong_word_limit)
if validation_bool:
return validation_bool, percentage_matched, "jumbled_words_percentage"
return validation_bool, percentage_matched, "all"
def finding_missed_articles_from_downloading(validated_pdf_list: list, original_articles_list: list) -> tuple:
"""Checks how many articles from the original list of articles have not been downloaded yet.
Parameters
----------
validated_pdf_list : list
Contains name of pdf files whose filename is in the pdf text.
original_articles_list : list
This is original list from where we started downloading the articles.
Returns
-------
tuple
missing_articles - these are the articles which were missed during downloading.
downloaded_articles - this is the list of validated downloaded articles.
"""
validated_pdf_text = converter.list_to_string(validated_pdf_list)
original_articles_set = set(original_articles_list)
missing_articles = []
downloaded_articles = []
for article_name in original_articles_set:
validation_bool, percentage_matched, methods = multiple_methods_validating_words_string_in_text(
article_name, validated_pdf_text)
if not validation_bool:
missing_articles.append(article_name)
elif validation_bool:
downloaded_articles.append(article_name)
return missing_articles, downloaded_articles
def get_missed_original_articles_list(original_article_list: list, downloaded_article_list: list) -> list:
"""This checks the elements of original_article_list in downloaded_article_list and returns the missed articles list.
Parameters
----------
original_article_list : list
This list's elements are checked for presence in the other list.
downloaded_article_list : list
This list is checked to see whether it contains the elements of the other list.
Returns
-------
list
This contains missing elements of original_article_list in downloaded_article_list.
"""
missed_articles_list = []
downloaded_articles_set = set(downloaded_article_list)
for article_name in original_article_list:
if article_name not in downloaded_articles_set:
missed_articles_list.append(article_name)
return missed_articles_list
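# Sketch with placeholder titles: everything in the original list that is absent from the downloaded
# list is reported as missed.
if __name__ == "__main__":
    print(get_missed_original_articles_list(
        ["article a", "article b", "article c"], ["article b"]))  # ['article a', 'article c']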
def get_missed_articles_dataframe(filter_sorted_citations_df: pd.DataFrame, downloaded_articles_path: str,
title_column_name: str = "cleaned_title") -> list:
"""return list of missed articles from downloading by checking original list of articles from
filter_sorted_citations_df using downloaded articles path.
Parameters
----------
title_column_name : str
This is the name of the column which contains the name of the article.
filter_sorted_citations_df : pd.DataFrame
This dataframe contains the records of selected articles, including the names of the articles.
downloaded_articles_path : str
This is the parent folder of all the downloaded article files.
Returns
-------
list
list of the missed articles from downloading.
"""
original_list = [i for i in filter_sorted_citations_df[title_column_name]]
validated_articles_list, invalidated_list, manual_list = validating_pdfs_using_multiple_pdf_reader(
downloaded_articles_path)
articles_list = getting_article_paths_from_validation_detail(validated_articles_list)
downloaded_list = [
string_manipulation.preprocess_string(os_utils.get_filename_from_path(k))
for k in articles_list]
missed_articles = finding_missed_articles_from_downloading(downloaded_list, original_list)
return missed_articles[0]
def getting_article_paths_from_validation_detail(list_of_validation: list) -> list:
"""Gets the first element from each inner list in a list of lists.
Parameters
----------
list_of_validation : list
This list contains lists of three values, where the first value is the article path.
Returns
-------
list
This output list contains the article paths.
"""
article_list = [i[0] for i in list_of_validation]
return article_list
def validating_pdfs_using_multiple_pdf_reader(pdfs_parent_dir_path: str) -> tuple:
"""This function uses two python pdf readers, pdftotext and pymupdf, to validate whether the filenames are present
inside the pdf file text.
Parameters
----------
pdfs_parent_dir_path : str
This is the parent directory of all the downloaded pdfs.
Returns
-------
tuple
validated_pdf_list - contains the pdf files whose filename is found in the pdf text.
invalidated_pdf_list - list of files which couldn't be included in validated_pdf_list.
manual_pdf_list - list of files which can't be opened using the python pdf readers or which raise errors on opening.
"""
articles_paths = os_utils.extract_files_path_from_directories_or_subdirectories(
pdfs_parent_dir_path)
print(f"Total number of articles: {len(articles_paths)}")
validated_list, invalidated_list, manual_list = validating_multiple_pdfs_via_filenames(articles_paths)
print(f"Using pdftotext reader to validate:")
print(f"Number of validated articles : {len(validated_list)}\n"
f"Number of invalidated articles : {len(invalidated_list)}\n "
f"Number of articles to open manually: {len(manual_list)}")
print("validating invalidated articles using other pdf reader.")
temp_invalidated_list, temp_manual_list = [], []
if len(invalidated_list) != 0:
temp_invalidated_list = getting_article_paths_from_validation_detail(invalidated_list)
if len(manual_list) != 0:
temp_manual_list = getting_article_paths_from_validation_detail(manual_list)
invalidated_list = temp_invalidated_list + temp_manual_list
temp_validated_list, temp_invalidated_list, temp_manual_list = validating_multiple_pdfs_via_filenames(
invalidated_list, pdf_reader="pymupdf")
print(f"Using pymupdf reader to validate:")
print(f"Number of validated articles : {len(temp_validated_list)}\n"
f"Number of invalidated articles : {len(temp_invalidated_list)}\n "
f"Number of articles to open manually: {len(temp_manual_list)}")
validated_list += temp_validated_list
invalidated_list = temp_invalidated_list
manual_list = temp_manual_list
print("Finally, using both python pdf readers:")
print(f"Number of validated articles : {len(validated_list)}\n"
f"Number of invalidated articles : {len(invalidated_list)}\n "
f"Number of articles to open manually: {len(manual_list)}")
return validated_list, invalidated_list, manual_list
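# Hedged usage sketch (not executed here because it needs a real directory of downloaded pdfs; the path
# is hypothetical). Each element of the three returned lists is [pdf_path, percentage_matched, method]:
#   validated, invalidated, manual = validating_pdfs_using_multiple_pdf_reader("downloaded_articles/")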
def manual_validating_of_pdf(articles_path_list: list, manual_index: int) -> tuple:
"""This is mostly a manually used function to validate some pdfs at the end of the validation process. It makes it
easy to search and validate a pdf and store it in a list.
Advice: convert these lists to a text file using the function in the converter module to avoid data loss.
Parameters
----------
articles_path_list : list
This is the list of articles which skipped our automated screening and validation algorithms, mostly due to
pdf to text conversion errors.
manual_index : int
This is the index from which you will start checking in articles_path_list. Normally used over many tries.
Returns
-------
tuple
external_validation_list - This is the list to be saved externally for validated articles.
external_invalidated_list - This is the list to be saved externally for invalidated articles.
"""
external_validation_list = []
external_invalidated_list = []
article_path = articles_path_list[manual_index]
print(article_path)
instructions = "Please provide 'y' to validate or 'n' to invalidate"
manual_input = input(instructions).lower()
if manual_input == "y":
external_validation_list.append(article_path)
elif manual_input == "n":
external_invalidated_list.append(article_path)
else:
print("input should be 'y' or 'n'")
return external_validation_list, external_invalidated_list
class Validation:
"""This is used to validate the downloaded files.
"""
download_flag_column_name = 'downloaded'
research_paper_file_location_column_name = 'file location'
validation_method_column_name = "validation method"
validation_manual_method_name = "manual"
cleaned_article_column_name = 'cleaned_title'
file_manual_check_flag_name = "unreadable"
file_validated_flag_name = "yes"
file_invalidated_flag_name = "wrong"
file_not_downloaded_flag_name = "no"
file_not_accessible_flag_name = "no access"
def __init__(self, citations_data: Union[List[dict], pd.DataFrame],
parents_directory_of_research_papers_files: str,
text_file_path_of_inaccessible_research_papers: str = None,
text_manipulation_method_name: str = "preprocess_string_to_space_separated_words",
words_percentage_checker_in_text_validation_limit: float = 70,
jumbled_words_percentage_checker_in_text_validation_limit: float = 70,
jumbled_words_percentage_checker_in_text_wrong_word_limit: int = 2
):
"""
Parameters
----------
citations_data : Union[List[dict], pd.DataFrame]
This contains citation data which we are validating.
parents_directory_of_research_papers_files : str
This is parent directory of all the downloaded files for citation data.
text_file_path_of_inaccessible_research_papers : str
This is the path of text file containing non-accessible research papers separated by newline.
text_manipulation_method_name : str
This provides the option to use any text manipulation function:
preprocess_string (default, and applied before all other implemented functions),
custom_text_manipulation_function - for plugging in your own function to preprocess the text,
nltk_remove_stopwords, pattern_lemma_or_lemmatize_text, nltk_word_net_lemmatizer,
nltk_porter_stemmer, nltk_lancaster_stemmer, spacy_lemma, nltk_remove_stopwords_spacy_lemma,
convert_string_to_lowercase, preprocess_string_to_space_separated_words
words_percentage_checker_in_text_validation_limit : float
This is the percentage limit on the similarity of the checked substring. Example: 50 returns True if half of the words are found.
jumbled_words_percentage_checker_in_text_validation_limit : float
This is the percentage limit on the similarity of the checked substring for the jumbled words check.
jumbled_words_percentage_checker_in_text_wrong_word_limit : int
This is the limit up to which the algorithm ignores wrong words in the sequence.
"""
self.jumbled_words_percentage_checker_in_text_wrong_word_limit = \
jumbled_words_percentage_checker_in_text_wrong_word_limit
self.jumbled_words_percentage_checker_in_text_validation_limit = \
jumbled_words_percentage_checker_in_text_validation_limit
self.words_percentage_checker_in_text_validation_limit = words_percentage_checker_in_text_validation_limit
self.text_manipulation_method_name = text_manipulation_method_name
self.text_file_path_of_inaccessible_research_papers = text_file_path_of_inaccessible_research_papers
self.parents_directory_of_research_papers_files = parents_directory_of_research_papers_files
self.citations_records_list = converter.dataframe_to_records_list(citations_data) \
if type(citations_data) == pd.DataFrame else citations_data
self.research_papers_list = self.add_downloaded_flag_column_and_file_location_column()
self.file_name_and_path_mapping = self.file_name_and_path_dict()
def add_downloaded_flag_column_and_file_location_column(self):
"""Adds empty columns based on research_paper_file_location_column_name and download_flag_column_name.
Returns
-------
List[dict]
data contains new columns.
"""
import copy
complete_citations_records_list = copy.deepcopy(self.citations_records_list)
inaccessible_research_papers_set = set([string_manipulation.text_manipulation_methods(
article_name, self.text_manipulation_method_name) for article_name in converter.text_file_to_list(
self.text_file_path_of_inaccessible_research_papers)]) if \
self.text_file_path_of_inaccessible_research_papers else self.text_file_path_of_inaccessible_research_papers
for record in complete_citations_records_list:
if inaccessible_research_papers_set and \
(record[self.cleaned_article_column_name] in inaccessible_research_papers_set):
record[self.download_flag_column_name] = self.file_not_accessible_flag_name
else:
record[self.download_flag_column_name] = self.file_not_downloaded_flag_name
record[self.research_paper_file_location_column_name] = ""
record[self.validation_method_column_name] = ""
return complete_citations_records_list
def file_name_and_path_dict(self):
"""Builds the mapping of filenames to file paths.
Returns
-------
dict
The key is the filename and the value is the file path.
"""
file_name_and_path = {}
articles_paths = os_utils.extract_files_path_from_directories_or_subdirectories(
self.parents_directory_of_research_papers_files)
for path in articles_paths:
article_name = os_utils.get_filename_from_path(path)
clean_article_name = string_manipulation.text_manipulation_methods(
article_name, self.text_manipulation_method_name)
file_name_and_path[clean_article_name] = path
return file_name_and_path
def check(self):
"""Executes the validation of research articles in the citation data by checking the research paper files and
validating whether the research articles are correct.
Returns
-------
List[dict]
The data contains the new downloaded, validation method and file location columns.
"""
for citation in self.research_papers_list:
if (citation[self.download_flag_column_name].lower() == "no") and (
citation[self.cleaned_article_column_name] in self.file_name_and_path_mapping):
research_paper = converter.Reader(
self.file_name_and_path_mapping[citation[self.cleaned_article_column_name]])
file_extension = research_paper.file_extension
if file_extension == 'pdf':
text = research_paper.pdf_pdftotext_reader()
if text:
validation_result = ValidateWordsInText(
citation[self.cleaned_article_column_name], text,
self.words_percentage_checker_in_text_validation_limit,
self.jumbled_words_percentage_checker_in_text_validation_limit,
self.jumbled_words_percentage_checker_in_text_wrong_word_limit).multiple_methods()
if validation_result[0]:
citation[self.download_flag_column_name] = self.file_validated_flag_name
citation[self.research_paper_file_location_column_name] = self.file_name_and_path_mapping[
citation[self.cleaned_article_column_name]]
citation[self.validation_method_column_name] = validation_result[2]
continue
text = research_paper.pdf_pymupdf_reader()
if not text:
citation[self.download_flag_column_name] = self.file_manual_check_flag_name
continue
validation_result = ValidateWordsInText(
citation[self.cleaned_article_column_name], text,
self.words_percentage_checker_in_text_validation_limit,
self.jumbled_words_percentage_checker_in_text_validation_limit,
self.jumbled_words_percentage_checker_in_text_wrong_word_limit).multiple_methods()
if validation_result[0]:
citation[self.download_flag_column_name] = self.file_validated_flag_name
citation[self.research_paper_file_location_column_name] = self.file_name_and_path_mapping[
citation[self.cleaned_article_column_name]]
citation[self.validation_method_column_name] = validation_result[2]
else:
citation[self.download_flag_column_name] = self.file_invalidated_flag_name
citation[self.research_paper_file_location_column_name] = self.file_name_and_path_mapping[
citation[self.cleaned_article_column_name]]
citation[self.validation_method_column_name] = validation_result[2]
else:
text = research_paper.get_text()
if not text:
citation[self.download_flag_column_name] = self.file_manual_check_flag_name
continue
validation_result = ValidateWordsInText(
citation[self.cleaned_article_column_name], text,
self.words_percentage_checker_in_text_validation_limit,
self.jumbled_words_percentage_checker_in_text_validation_limit,
self.jumbled_words_percentage_checker_in_text_wrong_word_limit).multiple_methods()
if validation_result[0]:
citation[self.download_flag_column_name] = self.file_validated_flag_name
citation[self.research_paper_file_location_column_name] = self.file_name_and_path_mapping[
citation[self.cleaned_article_column_name]]
citation[self.validation_method_column_name] = validation_result[2]
else:
citation[self.download_flag_column_name] = self.file_invalidated_flag_name
citation[self.research_paper_file_location_column_name] = self.file_name_and_path_mapping[
citation[self.cleaned_article_column_name]]
citation[self.validation_method_column_name] = validation_result[2]
return self.research_papers_list
def get_records_list(self) -> List[Dict[str, Any]]:
"""Outputs the records list containing validation results of input data.
Returns
-------
List[Dict[str, Any]]
This is the list of records which contains the validation flag column 'downloaded' with the values "yes", "no",
"wrong", "no access", "unreadable", and the file location column when the downloaded column contains "yes".
"""
return self.check()
def get_dataframe(self):
"""Outputs the pandas.DataFrame containing validation results of input data.
Returns
-------
pandas.DataFrame
This is the dataframe which contains the validation flag column 'downloaded' with the values "yes", "no",
"wrong", "no access", "unreadable", and the file location column when the downloaded column contains "yes".
"""
return converter.records_list_to_dataframe(self.check())
def info(self):
"""Equivalent to pandas.DataFrame.value_counts(); it returns the counts of the unique elements in the column.
Returns
-------
object
unique download_flag_column_name elements with counts
"""
return converter.dataframe_column_counts(self.get_dataframe(), self.download_flag_column_name)
def to_csv(self, output_filename: Union[str, None] = "output.csv", index: bool = True):
"""This function saves pandas.DataFrame to csv file.
Parameters
----------
output_filename : str
This is the name of the output file, which should contain the .csv extension.
index : bool
Define if index is needed in output csv file or not.
Returns
-------
"""
converter.dataframe_to_csv_file(self.get_dataframe(), output_filename, index)
def to_excel(self, output_filename: Union[str, None] = "output.xlsx", index: bool = True):
"""This function saves pandas.DataFrame to an excel file.
Parameters
----------
output_filename : str
This is the name of the output file, which should contain the .xlsx extension.
index : bool
Define if index is needed in output excel file or not.
Returns
-------
"""
converter.dataframe_to_excel_file(self.get_dataframe(), output_filename, index)
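# Hedged usage sketch of the Validation class; the paths and file names are hypothetical, and it needs
# the citation data plus the folder of downloaded papers on disk:
#   validation = Validation(filter_sorted_citations_df, "downloaded_articles/",
#                           text_file_path_of_inaccessible_research_papers="no_access.txt")
#   validation.info()                    # counts of the 'downloaded' flag values
#   validation.to_csv("validation.csv")  # full records with flags and file locations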