# -*- coding: utf-8 -*-
# split_text.py (1.2 KB)
  2. import re
  3. def split_text_to_word(text: str):
  4. words_list = re.findall(r'\b[-\'\w]+\b', text)
  5. return words_list
  6. def get_article_words_count(text: str):
  7. return len(split_text_to_word(text))
  8. def split_text_to_sentences(text: str) -> list:
  9. sentences = re.split(r'(?<=[.!?;])', text)
  10. sentences = [i for i in sentences if i.replace(" ", "")]
  11. return sentences
  12. def split_text_to_word_punctuation(text: str):
  13. word_punctuation_list = re.findall(r'\b[-\'\w]+\b|[^\w\s]|\n', text)
  14. return word_punctuation_list
  15. def is_word(single_word: str, strict: bool = False):
  16. """strict 严格模式,默认不开。严格模式下,每个实体字符必须是字母。全部都是字母才算是单词
  17. 非严格模式下,有一个字母就算是单词。即使是 op123
  18. """
  19. single_word = single_word.strip()
  20. if strict:
  21. r = all([re.search(r'[a-zA-Z]', char_) for char_ in single_word if char_])
  22. if r:
  23. return True
  24. return False
  25. if re.search(r'[a-zA-Z]', single_word):
  26. return True
  27. return False
  28. if __name__ == '__main__':
  29. a = "fdh fgdhf fgd-y i'am a student.gfddfgfd dfhgfd ! fdgh,fdgh fght. 3.1415"
  30. print(is_word("student34", strict=True))