Extracting raw metadata extraction/split functions to utils package.

getnikola · Jul 1, 2017 · 2c090ae · 2c090ae
1 parent c94951d
commit 2c090ae
Show file tree

Hide file tree

Showing 3 changed files with 115 additions and 69 deletions.
diff --git a/nikola/plugin_categories.py b/nikola/plugin_categories.py
@@ -28,13 +28,12 @@
 
 import sys
 import os
-import re
 import io
 
 from yapsy.IPlugin import IPlugin
 from doit.cmd_base import Command as DoitCommand
 
-from .utils import LOGGER, first_line
+from .utils import LOGGER, first_line, split_metadata
 
 __all__ = (
     'Command',
@@ -329,16 +328,8 @@ def split_metadata(self, data):
         This splits in the first empty line that is NOT at the beginning
         of the document, or after YAML/TOML metadata without an empty line.
         """
-        if data.startswith('---'):  # YAML metadata
-            split_result = re.split('(\n---\n|\r\n---\r\n)', data.lstrip(), maxsplit=1)
-        elif data.startswith('+++'):  # TOML metadata
-            split_result = re.split('(\n\\+\\+\\+\n|\r\n\\+\\+\\+\r\n)', data.lstrip(), maxsplit=1)
-        else:
-            split_result = re.split('(\n\n|\r\n\r\n)', data.lstrip(), maxsplit=1)
-        if len(split_result) == 1:
-            return '', split_result[0]
-        # ['metadata', '\n\n', 'post content']
-        return split_result[0], split_result[-1]
+        meta, content, _ = split_metadata(data)
+        return meta, content
 
     def get_compiler_extensions(self):
         """Activate all the compiler extension plugins for a given compiler and return them."""

diff --git a/nikola/post.py b/nikola/post.py
@@ -937,19 +937,8 @@ def source_ext(self, prefix=False):
 # Code that fetches metadata from different places
 
 
-def re_meta(line, match=None):
-    """Find metadata using regular expressions."""
-    if match:
-        reStr = re.compile('^\.\. {0}: (.*)'.format(re.escape(match)))
-    else:
-        reStr = re.compile('^\.\. (.*?): (.*)')
-    result = reStr.findall(line.strip())
-    if match and result:
-        return (match, result[0])
-    elif not match and result:
-        return (result[0][0], result[0][1].strip())
-    else:
-        return (None,)
+# For backwards compatibility
+re_meta = utils.re_meta
 
 
 def _get_metadata_from_filename_by_regex(filename, metadata_regexp, unslugify_titles, lang):
@@ -982,8 +971,8 @@ def get_metadata_from_file(source_path, config=None, lang=None):
         elif lang:
             source_path += '.' + lang
         with io.open(source_path, "r", encoding="utf-8-sig") as meta_file:
-            meta_data = [x.strip() for x in meta_file.readlines()]
-        return _get_metadata_from_file(meta_data, config)
+            file_lines = [x.strip() for x in meta_file.readlines()]
+        return _get_metadata_from_file(file_lines, config)
     except (UnicodeDecodeError, UnicodeEncodeError):
         msg = 'Error reading {0}: Nikola only supports UTF-8 files'.format(source_path)
         LOGGER.error(msg)
@@ -999,52 +988,15 @@ def get_metadata_from_file(source_path, config=None, lang=None):
     string.punctuation)))
 
 
-def _get_metadata_from_file(meta_data, config=None):
+def _get_metadata_from_file(file_lines, config=None):
     """Extract metadata from a post's source file."""
-    meta = {}
-    if not meta_data:
-        return meta
-
-    # Skip up to one empty line at the beginning (for txt2tags)
-    if not meta_data[0]:
-        meta_data = meta_data[1:]
-
-    # If 1st line is '---', then it's YAML metadata
-    if meta_data[0] == '---':
-        if yaml is None:
-            utils.req_missing('pyyaml', 'use YAML metadata', optional=True)
-            raise ValueError('Error parsing metadata')
-        idx = meta_data.index('---', 1)
-        meta = yaml.safe_load('\n'.join(meta_data[1:idx]))
-        # We expect empty metadata to be '', not None
-        for k in meta:
-            if meta[k] is None:
-                meta[k] = ''
+    meta, meta_type = utils.extract_metadata(file_lines)
+    if meta_type == 'yaml':
         # Map metadata from other platforms to names Nikola expects (Issue #2817)
         map_metadata(meta, 'yaml', config)
-        return meta
-
-    # If 1st line is '+++', then it's TOML metadata
-    if meta_data[0] == '+++':
-        if toml is None:
-            utils.req_missing('toml', 'use TOML metadata', optional=True)
-            raise ValueError('Error parsing metadata')
-        idx = meta_data.index('+++', 1)
-        meta = toml.loads('\n'.join(meta_data[1:idx]))
+    elif meta_type == 'toml':
         # Map metadata from other platforms to names Nikola expects (Issue #2817)
         map_metadata(meta, 'toml', config)
-        return meta
-
-    # First, get metadata from the beginning of the file,
-    # up to first empty line
-
-    for i, line in enumerate(meta_data):
-        if not line:
-            break
-        match = re_meta(line)
-        if match[0]:
-            meta[match[0]] = match[1]
-
     return meta
 
 

diff --git a/nikola/utils.py b/nikola/utils.py
@@ -97,7 +97,7 @@
            'adjust_name_for_index_path', 'adjust_name_for_index_link',
            'NikolaPygmentsHTML', 'create_redirect', 'clean_before_deployment',
            'sort_posts', 'indent', 'load_data', 'html_unescape', 'rss_writer',
-           'map_metadata',
+           'map_metadata', 're_meta', 'extract_metadata', 'split_metadata',
            # Deprecated, moved to hierarchy_utils:
            'TreeNode', 'clone_treenode', 'flatten_tree_structure',
            'sort_classifications', 'join_hierarchical_category_path',
@@ -2028,3 +2028,106 @@ def read_from_config(self, site, basename, posts_per_classification_per_language
         args = {'translation_manager': self, 'site': site,
                 'posts_per_classification_per_language': posts_per_classification_per_language}
         signal('{}_translations_config'.format(basename.lower())).send(args)
+
+
+def re_meta(line, match=None):
+    """Find a ``.. key: value`` metadata entry in ``line``.
+
+    Return ``(match, value)`` or ``(key, value)``; ``(None,)`` if nothing found.
+    """
+    if match:
+        pattern = r'^\.\. {0}: (.*)'.format(re.escape(match))
+    else:
+        pattern = r'^\.\. (.*?): (.*)'
+    found = re.findall(pattern, line.strip())
+    if not found:
+        return (None,)
+    return (match, found[0]) if match else (found[0][0], found[0][1].strip())
+
+
+def extract_metadata(file_lines):
+    """Extract metadata from the lines of a file.
+
+    Returns a pair ``(meta, type)``, where ``meta`` is the
+    metadata dictionary and ``type`` the metadata format.
+
+    Valid values for ``type`` are:
+    * ``'none'``: no metadata was found (file was empty)
+    * ``'yaml'``: metadata in YAML format
+    * ``'toml'``: metadata in TOML format
+    * ``'nikola'``: metadata in the standard Nikola format
+                    (reST-like ``.. key: value`` lines)
+
+    Raises ``ValueError`` if the YAML/TOML library or closing marker is missing.
+    """
+    # Skip up to one empty line at the beginning (for txt2tags)
+    if file_lines and not file_lines[0]:
+        file_lines = file_lines[1:]
+
+    meta = {}
+    if not file_lines:
+        return meta, 'none'
+
+    # If 1st line is '---', then it's YAML metadata
+    if file_lines[0] == '---':
+        if yaml is None:
+            req_missing('pyyaml', 'use YAML metadata', optional=True)
+            raise ValueError('Error parsing metadata')
+        idx = file_lines.index('---', 1)
+        # An empty YAML block loads as None; treat it as no keys
+        meta = yaml.safe_load('\n'.join(file_lines[1:idx])) or {}
+        for k in meta:  # We expect empty metadata to be '', not None
+            if meta[k] is None:
+                meta[k] = ''
+        return meta, 'yaml'
+
+    # If 1st line is '+++', then it's TOML metadata
+    if file_lines[0] == '+++':
+        if toml is None:
+            req_missing('toml', 'use TOML metadata', optional=True)
+            raise ValueError('Error parsing metadata')
+        idx = file_lines.index('+++', 1)
+        meta = toml.loads('\n'.join(file_lines[1:idx]))
+        return meta, 'toml'
+
+    # Nikola format: read metadata lines up to the first empty line
+    for line in file_lines:
+        if not line:
+            break
+        match = re_meta(line)
+        if match[0]:
+            meta[match[0]] = match[1]
+
+    return meta, 'nikola'
+
+
+def split_metadata(data):
+    """Split data from metadata in the raw post content.
+
+    This splits in the first empty line that is NOT at the beginning
+    of the document, or after YAML/TOML metadata without an empty line.
+
+    Returns a tuple ``(meta, content, type)`` where ``meta`` and
+    ``content`` are parts of ``data``, and ``type`` is the metadata
+    format.
+
+    Valid values for ``type`` are:
+    * ``'none'``: no metadata was found (no separator in ``data``)
+    * ``'yaml'``: metadata in YAML format
+    * ``'toml'``: metadata in TOML format
+    * ``'nikola'``: metadata in the standard Nikola format
+                    (reST-like ``.. key: value`` lines)
+    """
+    # Detect the format on the same (left-stripped) text that gets split
+    data = data.lstrip()
+    if data.startswith('---'):  # YAML metadata
+        separator, meta_type = r'(\n---\n|\r\n---\r\n)', 'yaml'
+    elif data.startswith('+++'):  # TOML metadata
+        separator, meta_type = r'(\n\+\+\+\n|\r\n\+\+\+\r\n)', 'toml'
+    else:
+        separator, meta_type = r'(\n\n|\r\n\r\n)', 'nikola'
+    split_result = re.split(separator, data, maxsplit=1)
+    if len(split_result) == 1:
+        return '', split_result[0], 'none'
+    # ['metadata', '\n\n', 'post content']
+    return split_result[0], split_result[-1], meta_type