Source code for systematic_review.string_manipulation

"""Module: string_manipulation
This module contains functions for changing the case of strings, preprocessing text, and removing parts of a string.
"""

import unicodedata
from typing import Callable, Any
from systematic_review import os_utils, nlp


def string_dict_to_lower(string_map: dict) -> dict:
    """Return a new dict whose values are lowercase strings.

    Every value is converted with ``str()``, lowercased, and has each newline
    character replaced by a single space. Keys are left untouched. The
    equivalent function for lists is string_list_to_lower().

    Parameters
    ----------
    string_map : dict
        key: value pairs whose values need to be converted.

    Returns
    -------
    dict
        new dictionary mapping each original key to its lowercase value.
    """
    return {
        key: str(value).lower().replace("\n", " ")
        for key, value in string_map.items()
    }
def string_list_to_lower(string_list: list) -> list:
    """Return a new list holding the lowercase form of every element.

    Every element is converted with ``str()``, lowercased, and has each
    newline character replaced by a single space. The equivalent function for
    dicts is string_dict_to_lower().

    Parameters
    ----------
    string_list : list
        input values that need to be converted to lowercase.

    Returns
    -------
    list
        the converted values, in the same order as the input.
    """
    return [str(item).lower().replace("\n", " ") for item in string_list]
def string_to_space_separated_words(text: str) -> str:
    """Collapse all whitespace in ``text`` to single spaces.

    Parameters
    ----------
    text : str
        text that may contain repeated, leading, or trailing whitespace.

    Returns
    -------
    str
        the words of ``text`` joined by exactly one space, with no leading or
        trailing whitespace.
    """
    # split() without arguments splits on any whitespace run and drops empties
    return " ".join(text.split())
def remove_non_ascii(string_list: list) -> list:
    """Remove non-ASCII characters from a list of tokenized words.

    Parameters
    ----------
    string_list : list
        words which may contain non-ASCII characters.

    Returns
    -------
    list
        the words, in the same order, with non-ASCII characters removed.
    """

    def _to_ascii(word):
        # NFKD decomposes characters (e.g. an accented letter into base letter
        # plus combining mark) so the ASCII part survives the encode step.
        decomposed = unicodedata.normalize('NFKD', word)
        ascii_bytes = decomposed.encode('ascii', 'ignore')
        return ascii_bytes.decode('utf-8', 'ignore')

    return [_to_ascii(word) for word in string_list]
def split_words_remove_duplicates(string_list: list) -> list:
    """Split every entry into individual words and drop repeated words.

    Parameters
    ----------
    string_list : list
        words and groups of whitespace-separated words.
        Example - ['one', 'one two']

    Returns
    -------
    list
        the unique individual words, ordered by first occurrence.
        Example - ['one', 'two']
    """
    # A dict is used as an ordered set: building the result from set() gave an
    # arbitrary order that could change between interpreter runs, whereas dict
    # keys keep insertion order and make the output reproducible.
    unique_words = {}
    for string in string_list:
        for word in string.split():
            unique_words.setdefault(word, None)
    return list(unique_words)
def preprocess_string(string: str) -> str:
    """Replace symbols in the string with spaces and lowercase the result.

    The input is passed through replace_symbols_with_space() and then through
    convert_string_to_lowercase().

    Parameters
    ----------
    string : str
        input text which may contain unwanted symbols and uppercase
        characters.

    Returns
    -------
    str
        lowercase text in which the symbols have been replaced by spaces.
    """
    return convert_string_to_lowercase(replace_symbols_with_space(string))
def preprocess_string_to_space_separated_words(string: str) -> str:
    """Preprocess the string and return its words separated by single spaces.

    The input is passed through preprocess_string() and then through
    string_to_space_separated_words(). Example - 'Df%$df' -> 'df df'

    Parameters
    ----------
    string : str
        text that can contain words mixed with spaces and symbols.

    Returns
    -------
    str
        the preprocessed words joined by single spaces.
    """
    return string_to_space_separated_words(preprocess_string(string))
def replace_symbols_with_space(string: str) -> str:
    """Replace every non-alphabetic character in the string with a space.

    Each character for which ``str.isalpha()`` is false (digits, punctuation,
    any whitespace) becomes exactly one space, so the length of the string is
    unchanged and the case of letters is kept.
    Example - 'df%$df' -> 'df  df' (two symbols give two spaces).

    Parameters
    ----------
    string : str
        input text which contains unwanted symbols.

    Returns
    -------
    str
        text containing only alphabetic characters and spaces.
    """
    # An existing space is not alphabetic either, so it also maps to " ".
    return "".join(
        character if character.isalpha() else " " for character in string
    )
def convert_string_to_lowercase(string: str) -> str:
    """Return the lowercase version of the given string.

    Parameters
    ----------
    string : str
        text which might have uppercase characters in it.

    Returns
    -------
    str
        the same text with all characters lowercased.
    """
    lowered = string.lower()
    return lowered
def split_preprocess_string(text: str) -> list:
    """Preprocess the text and split it into a list of words.

    The text is passed through preprocess_string() and the result is split on
    whitespace.

    Parameters
    ----------
    text : str
        input text which may contain unwanted symbols and uppercase
        characters.

    Returns
    -------
    list
        the words of the preprocessed text.
    """
    return preprocess_string(text).split()
def pdf_filename_from_filepath(article_path: str) -> str:
    """Return the name of a pdf file taken from its path.

    The filename is obtained from os_utils.get_filename_from_path() and then
    passed to strip_string_from_right_side() with its default
    ``value_to_be_stripped`` ('.pdf').

    Parameters
    ----------
    article_path : str
        path of the pdf file.

    Returns
    -------
    str
        the filename after stripping from the right side.
    """
    return strip_string_from_right_side(
        os_utils.get_filename_from_path(article_path)
    )
def strip_string_from_right_side(string: str, value_to_be_stripped: str = ".pdf") -> str:
    """Remove a trailing substring from the right end of the string.

    The substring is removed at most once, and only when the string actually
    ends with it; otherwise the string is returned unchanged.

    Parameters
    ----------
    string : str
        the complete word or string. Example - 'monster.pdf'
    value_to_be_stripped : str
        the suffix to remove from the right side. Example - '.pdf'

    Returns
    -------
    str
        the string without the trailing suffix. Example - 'monster'
    """
    # str.rstrip() treats its argument as a set of characters, so
    # 'append.pdf'.rstrip('.pdf') would give 'appen'. Match the whole suffix.
    # The emptiness check avoids string[:-0], which would return ''.
    if value_to_be_stripped and string.endswith(value_to_be_stripped):
        return string[:-len(value_to_be_stripped)]
    return string
def text_manipulation_methods(text: str, text_manipulation_method_name: str = "preprocess_string",
                              custom_text_manipulation_function: Callable[[str, Any, Any], str] = None,
                              *args, **kwargs) -> str:
    """Convert text using the text manipulation method selected by name.

    The method name is matched case-insensitively. ``preprocess_string`` is
    always applied to ``text`` first; its result is returned directly for the
    default method and is the input given to the nlp module functions. The
    other non-nlp methods receive the original ``text``.

    Parameters
    ----------
    text : str
        string type text which is needed to be converted.
    text_manipulation_method_name : str
        name of the text manipulation method to use. Supported names:
        preprocess_string (default),
        convert_string_to_lowercase,
        preprocess_string_to_space_separated_words,
        custom_text_manipulation_function - to use your own function,
        nltk_remove_stopwords, pattern_lemma_or_lemmatize_text,
        nltk_word_net_lemmatizer, nltk_porter_stemmer,
        nltk_lancaster_stemmer, spacy_lemma,
        nltk_remove_stopwords_spacy_lemma.
    custom_text_manipulation_function : Callable[[str, Any, Any], str]
        optional function used when the method name is
        'custom_text_manipulation_function'. It is called with the original
        ``text`` (no preprocess_string applied), the ``args`` tuple and the
        ``kwargs`` dict.
    args : Tuple
        extra positional arguments for custom_text_manipulation_function.
    kwargs : Dict[str, Any]
        extra keyword arguments for custom_text_manipulation_function.

    Returns
    -------
    str
        the converted text.

    Raises
    ------
    NotImplementedError
        if the method name is not one of the supported names.
    """
    # Each of these names is also the name of the nlp module function to call.
    nlp_method_names = (
        "nltk_remove_stopwords",
        "pattern_lemma_or_lemmatize_text",
        "nltk_word_net_lemmatizer",
        "nltk_porter_stemmer",
        "nltk_lancaster_stemmer",
        "spacy_lemma",
        "nltk_remove_stopwords_spacy_lemma",
    )

    # Computed up front for every method, including those that do not use it.
    preprocessed_text = preprocess_string(text)
    method_name = text_manipulation_method_name.lower()

    if method_name == "preprocess_string":
        return preprocessed_text
    if method_name == "convert_string_to_lowercase":
        return convert_string_to_lowercase(text)
    if method_name == "custom_text_manipulation_function":
        # NOTE(review): args and kwargs are handed over as two positional
        # objects (a tuple and a dict), not unpacked - confirm this is the
        # intended calling convention for custom functions.
        return custom_text_manipulation_function(text, args, kwargs)
    if method_name == "preprocess_string_to_space_separated_words":
        return preprocess_string_to_space_separated_words(text)
    if method_name in nlp_method_names:
        nlp_function = getattr(nlp, method_name)
        return nlp_function(preprocessed_text)
    raise NotImplementedError("Not implemented yet.")