Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Merge pull request #1888 from getnikola/import-wordpress-more-attachm…
…ent-metadata

Adding more attachment metadata for WordPress importer.
  • Loading branch information
felixfontein committed Jul 13, 2015
2 parents 99f990d + 1392fcb commit afe1629
Show file tree
Hide file tree
Showing 3 changed files with 131 additions and 54 deletions.
2 changes: 1 addition & 1 deletion CHANGES.txt
Expand Up @@ -17,7 +17,7 @@ Features
converting posts to markdown with --use-wordpress-compiler
* Allowing to automatically install the WordPress page compiler when
needed with --install-wordpress-compiler
* Exporting information on attachments per post as JSON
* Exporting information on attachments per post as JSON (#1867 and #1888)
* Exporting post status and excerpt
* New ‘pagekind’ variable available to identify different kinds of pages from theme templates
* Add ``--no-server`` option to ``nikola auto`` (Issue #1883)
Expand Down
172 changes: 121 additions & 51 deletions nikola/plugins/command/import_wordpress.py
Expand Up @@ -439,6 +439,7 @@ def download_url_content_to_file(self, url, dst_path):
LOGGER.warn("Downloading {0} to {1} failed: {2}".format(url, dst_path, err))

def import_attachment(self, item, wordpress_namespace):
# Download main image
url = get_text_tag(
item, '{{{0}}}attachment_url'.format(wordpress_namespace), 'foo')
link = get_text_tag(item, '{{{0}}}link'.format(wordpress_namespace),
Expand All @@ -453,62 +454,131 @@ def import_attachment(self, item, wordpress_namespace):
links[link] = '/' + dst_url
links[url] = '/' + dst_url

result = {}
result['files'] = [path] + self.download_additional_image_sizes(
item,
wordpress_namespace,
os.path.dirname(url)
)
return result

def download_additional_image_sizes(self, item, wordpress_namespace, source_path):
if phpserialize is None:
return []
files = [path]
files_meta = [{}]

additional_metadata = item.findall('{{{0}}}postmeta'.format(wordpress_namespace))
if additional_metadata is None:
return []

result = []
for element in additional_metadata:
meta_key = element.find('{{{0}}}meta_key'.format(wordpress_namespace))
if meta_key is not None and meta_key.text == '_wp_attachment_metadata':
meta_value = element.find('{{{0}}}meta_value'.format(wordpress_namespace))
if phpserialize and additional_metadata:
source_path = os.path.dirname(url)
for element in additional_metadata:
meta_key = element.find('{{{0}}}meta_key'.format(wordpress_namespace))
if meta_key is not None and meta_key.text == '_wp_attachment_metadata':
meta_value = element.find('{{{0}}}meta_value'.format(wordpress_namespace))

if meta_value is None:
continue

# Someone from Wordpress thought it was a good idea to
# serialize PHP objects into that metadata field. Given
# that the export should give you the power to insert
# your blogging into another site or system, it's not.
# Why don't they just use JSON?
if sys.version_info[0] == 2:
try:
metadata = phpserialize.loads(utils.sys_encode(meta_value.text))
except ValueError:
# local encoding might be wrong sometimes
metadata = phpserialize.loads(meta_value.text.encode('utf-8'))
else:
metadata = phpserialize.loads(meta_value.text.encode('utf-8'))

if meta_value is None:
continue
meta_key = b'image_meta'
size_key = b'sizes'
file_key = b'file'
width_key = b'width'
height_key = b'height'

# Extract metadata
if width_key in metadata and height_key in metadata:
files_meta[0]['width'] = int(metadata[width_key])
files_meta[0]['height'] = int(metadata[height_key])

if meta_key in metadata:
image_meta = metadata[meta_key]
dst_meta = {}

def add(our_key, wp_key, is_int=False, ignore_zero=False):
if wp_key in image_meta:
value = image_meta[wp_key]
if is_int:
value = int(value)
if ignore_zero and value == 0:
return
else:
value = value.decode('utf-8') # assume UTF-8
if value == '': # skip empty values
return
dst_meta[our_key] = value

add('aperture', b'aperture', is_int=True, ignore_zero=True)
add('credit', b'credit')
add('camera', b'camera')
add('caption', b'caption')
add('created_timestamp', b'created_timestamp', is_int=True, ignore_zero=True)
add('copyright', b'copyright')
add('focal_length', b'focal_length', is_int=True, ignore_zero=True)
add('iso', b'iso', is_int=True, ignore_zero=True)
add('shutter_speed', b'shutter_speed', is_int=True, ignore_zero=True)
add('title', b'title')

if len(dst_meta) > 0:
files_meta[0]['meta'] = dst_meta

# Find other sizes of image
if size_key not in metadata:
continue

for size in metadata[size_key]:
filename = metadata[size_key][size][file_key]
url = '/'.join([source_path, filename.decode('utf-8')])

# Construct metadata
meta = {}
meta['size'] = size.decode('utf-8')
if width_key in metadata[size_key][size] and height_key in metadata[size_key][size]:
meta['width'] = metadata[size_key][size][width_key]
meta['height'] = metadata[size_key][size][height_key]

path = urlparse(url).path
dst_path = os.path.join(*([self.output_folder, 'files'] + list(path.split('/'))))
dst_dir = os.path.dirname(dst_path)
utils.makedirs(dst_dir)
LOGGER.info("Downloading {0} => {1}".format(url, dst_path))
self.download_url_content_to_file(url, dst_path)
dst_url = '/'.join(dst_path.split(os.sep)[2:])
links[url] = '/' + dst_url

files.append(path)
files_meta.append(meta)

# Prepare result
result = {}
result['files'] = files
result['files_meta'] = files_meta

# Someone from Wordpress thought it was a good idea to
# serialize PHP objects into that metadata field. Given
# that the export should give you the power to insert
# your blogging into another site or system, it's not.
# Why don't they just use JSON?
if sys.version_info[0] == 2:
try:
metadata = phpserialize.loads(utils.sys_encode(meta_value.text))
except ValueError:
# local encoding might be wrong sometimes
metadata = phpserialize.loads(meta_value.text.encode('utf-8'))
else:
metadata = phpserialize.loads(meta_value.text.encode('utf-8'))
size_key = b'sizes'
file_key = b'file'
# Prepare extraction of more information
dc_namespace = item.nsmap['dc']
content_namespace = item.nsmap['content']
excerpt_namespace = item.nsmap['excerpt']

if size_key not in metadata:
continue
def add(result_key, key, namespace=None, filter=None, store_empty=False):
if namespace is not None:
value = get_text_tag(item, '{{{0}}}{1}'.format(namespace, key), None)
else:
value = get_text_tag(item, key, None)
if value is not None:
if filter:
value = filter(value)
if value or store_empty:
result[result_key] = value

add('title', 'title')
add('date_utc', 'post_date_gmt', namespace=wordpress_namespace)
add('wordpress_user_name', 'creator', namespace=dc_namespace)
add('content', 'encoded', namespace=content_namespace)
add('excerpt', 'encoded', namespace=excerpt_namespace)
add('description', 'description')

for filename in [metadata[size_key][size][file_key] for size in metadata[size_key]]:
url = '/'.join([source_path, filename.decode('utf-8')])

path = urlparse(url).path
dst_path = os.path.join(*([self.output_folder, 'files'] + list(path.split('/'))))
dst_dir = os.path.dirname(dst_path)
utils.makedirs(dst_dir)
LOGGER.info("Downloading {0} => {1}".format(url, dst_path))
self.download_url_content_to_file(url, dst_path)
dst_url = '/'.join(dst_path.split(os.sep)[2:])
links[url] = '/' + dst_url
result.append(path)
return result

code_re1 = re.compile(r'\[code.* lang.*?="(.*?)?".*\](.*?)\[/code\]', re.DOTALL | re.MULTILINE)
Expand Down Expand Up @@ -878,7 +948,7 @@ def import_posts(self, channel):
# Assign attachments to posts
for post_id in self.attachments:
LOGGER.warn(("Found attachments for post or page #{0}, but didn't find post or page. " +
"(Attachments: {1})").format(post_id, [e[0] for _, e in self.attachments[post_id].items()]))
"(Attachments: {1})").format(post_id, [e['files'][0] for e in self.attachments[post_id].values()]))


def get_text_tag(tag, name, default):
Expand Down
11 changes: 9 additions & 2 deletions tests/test_command_import_wordpress.py
Expand Up @@ -250,8 +250,15 @@ def test_importing_posts_and_attachments(self):

self.assertTrue(write_attachments_info.called)
write_attachments_info.assert_any_call('new_site/posts/2008/07/arzt-und-pfusch-s-i-c-k.attachments.json'.replace('/', os.sep),
{10: ['/wp-content/uploads/2008/07/arzt_und_pfusch-sick-cover.png',
'/wp-content/uploads/2008/07/arzt_und_pfusch-sick-cover-150x150.png']})
{10: {'wordpress_user_name': 'Niko',
'files_meta': [{'width': 300, 'height': 299},
{'width': b'150', 'size': 'thumbnail', 'height': b'150'}],
'excerpt': 'Arzt+Pfusch - S.I.C.K.',
'date_utc': '2009-07-16 19:40:37',
'content': 'Das Cover von Arzt+Pfusch - S.I.C.K.',
'files': ['/wp-content/uploads/2008/07/arzt_und_pfusch-sick-cover.png',
'/wp-content/uploads/2008/07/arzt_und_pfusch-sick-cover-150x150.png'],
'title': 'Arzt+Pfusch - S.I.C.K.'}})

write_content.assert_any_call(
'new_site/posts/2008/07/arzt-und-pfusch-s-i-c-k.md'.replace('/', os.sep),
Expand Down

0 comments on commit afe1629

Please sign in to comment.