split_text.py 1.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051
  1. # -*- coding: utf-8 -*-
  2. import re
  3. def split_text_to_word(text:str,split_hyphen=False):
  4. """
  5. split_hyphen: 是否分拆-连字符,默认不拆
  6. """
  7. if split_hyphen:
  8. words_list = re.findall(r'\b[\'\w]+\b',text)
  9. else:
  10. words_list = re.findall(r'\b[-\'’\w]+\b', text)
  11. return words_list
  12. def get_article_words_count(text:str):
  13. return len(split_text_to_word(text))
  14. def split_text_to_sentences(text:str) -> list:
  15. sentences = re.split(r'(?<=[.!?;])', text)
  16. sentences = [i for i in sentences if i.replace(" ", "")]
  17. return sentences
  18. def split_text_to_word_punctuation(text:str):
  19. word_punctuation_list = re.findall(r'\b[-\'\w]+\b|[^\w\s]|\n',text)
  20. return word_punctuation_list
  21. def is_word(single_word:str,strict:bool=False):
  22. """strict 严格模式,默认不开。严格模式下,每个实体字符必须是字母。全部都是字母才算是单词
  23. 非严格模式下,有一个字母就算是单词。即使是 op123,it's
  24. """
  25. single_word = single_word.strip()
  26. if strict:
  27. r = all([re.search(r'[a-zA-Z]', char_) for char_ in single_word if char_])
  28. if r:
  29. return True
  30. return False
  31. if re.search(r'[\'a-zA-Z]', single_word):
  32. return True
  33. return False
  34. if __name__ == '__main__':
  35. a = "fdh fgdhf fgd-y i'am a student.gfddfgfd dfhgfd ! fdgh,fdgh fght. 3.1415"
  36. print(is_word("student34",strict=True))