:py:mod:`generic.utils`
=======================

.. py:module:: generic.utils

.. autodoc2-docstring:: generic.utils
   :allowtitles:

Submodules
----------

.. toctree::
   :titlesonly:
   :maxdepth: 1

   generic.utils.text_parser

Package Contents
----------------

Functions
~~~~~~~~~

.. list-table::
   :class: autosummary longtable
   :align: left

   * - :py:obj:`get_meta_property <generic.utils.get_meta_property>`
     - .. autodoc2-docstring:: generic.utils.get_meta_property
          :summary:
   * - :py:obj:`extract_article <generic.utils.extract_article>`
     - .. autodoc2-docstring:: generic.utils.extract_article
          :summary:
   * - :py:obj:`idn2ascii <generic.utils.idn2ascii>`
     - .. autodoc2-docstring:: generic.utils.idn2ascii
          :summary:
   * - :py:obj:`get_uniform_metadata <generic.utils.get_uniform_metadata>`
     - .. autodoc2-docstring:: generic.utils.get_uniform_metadata
          :summary:
   * - :py:obj:`str_to_isoformat <generic.utils.str_to_isoformat>`
     - .. autodoc2-docstring:: generic.utils.str_to_isoformat
          :summary:
   * - :py:obj:`get_metadata <generic.utils.get_metadata>`
     - .. autodoc2-docstring:: generic.utils.get_metadata
          :summary:
   * - :py:obj:`count_xml_character <generic.utils.count_xml_character>`
     - .. autodoc2-docstring:: generic.utils.count_xml_character
          :summary:
   * - :py:obj:`generate_hashed_filename <generic.utils.generate_hashed_filename>`
     - .. autodoc2-docstring:: generic.utils.generate_hashed_filename
          :summary:
   * - :py:obj:`is_path_matched <generic.utils.is_path_matched>`
     - .. autodoc2-docstring:: generic.utils.is_path_matched
          :summary:
   * - :py:obj:`is_file_url <generic.utils.is_file_url>`
     - .. autodoc2-docstring:: generic.utils.is_file_url
          :summary:
   * - :py:obj:`get_url_without_fragment <generic.utils.get_url_without_fragment>`
     - .. autodoc2-docstring:: generic.utils.get_url_without_fragment
          :summary:
   * - :py:obj:`analyze_text_with_spacy <generic.utils.analyze_text_with_spacy>`
     - .. autodoc2-docstring:: generic.utils.analyze_text_with_spacy
          :summary:
   * - :py:obj:`tokens_include_predicate <generic.utils.tokens_include_predicate>`
     - .. autodoc2-docstring:: generic.utils.tokens_include_predicate
          :summary:

API
~~~

.. py:function:: get_meta_property(response: scrapy.http.Response, name: str) -> str
   :canonical: generic.utils.get_meta_property

   .. autodoc2-docstring:: generic.utils.get_meta_property

.. py:function:: extract_article(res: scrapy.http.Response) -> dict
   :canonical: generic.utils.extract_article

   .. autodoc2-docstring:: generic.utils.extract_article

.. py:function:: idn2ascii(url_str: str) -> str
   :canonical: generic.utils.idn2ascii

   .. autodoc2-docstring:: generic.utils.idn2ascii

.. py:function:: get_uniform_metadata(html: str, base_url: str)
   :canonical: generic.utils.get_uniform_metadata

   .. autodoc2-docstring:: generic.utils.get_uniform_metadata

.. py:function:: str_to_isoformat(string: str)
   :canonical: generic.utils.str_to_isoformat

   .. autodoc2-docstring:: generic.utils.str_to_isoformat

.. py:function:: get_metadata(res: scrapy.http.Response) -> dict
   :canonical: generic.utils.get_metadata

   .. autodoc2-docstring:: generic.utils.get_metadata

.. py:function:: count_xml_character(xml_string: str) -> int
   :canonical: generic.utils.count_xml_character

   .. autodoc2-docstring:: generic.utils.count_xml_character

.. py:function:: generate_hashed_filename(url, domain_size: int = 8, url_size: int = 32, max_len: int = 255) -> str
   :canonical: generic.utils.generate_hashed_filename

   .. autodoc2-docstring:: generic.utils.generate_hashed_filename

.. py:function:: is_path_matched(url: str, regexp: str) -> bool
   :canonical: generic.utils.is_path_matched

   .. autodoc2-docstring:: generic.utils.is_path_matched

.. py:function:: is_file_url(url: str, regexp: str = '(?:/|\\.html?|\\.php|\\.aspx?|/[^./]+)$') -> bool
   :canonical: generic.utils.is_file_url

   .. autodoc2-docstring:: generic.utils.is_file_url

.. py:function:: get_url_without_fragment(url_string: str) -> str
   :canonical: generic.utils.get_url_without_fragment

   .. autodoc2-docstring:: generic.utils.get_url_without_fragment

.. py:function:: analyze_text_with_spacy(client: httpx.AsyncClient, text: str, url: str)
   :canonical: generic.utils.analyze_text_with_spacy
   :async:

   .. autodoc2-docstring:: generic.utils.analyze_text_with_spacy

.. py:function:: tokens_include_predicate(tokens)
   :canonical: generic.utils.tokens_include_predicate

   .. autodoc2-docstring:: generic.utils.tokens_include_predicate