@@ -439,6 +439,7 @@ def download_url_content_to_file(self, url, dst_path):
439
439
LOGGER .warn ("Downloading {0} to {1} failed: {2}" .format (url , dst_path , err ))
440
440
441
441
def import_attachment (self , item , wordpress_namespace ):
442
+ # Download main image
442
443
url = get_text_tag (
443
444
item , '{{{0}}}attachment_url' .format (wordpress_namespace ), 'foo' )
444
445
link = get_text_tag (item , '{{{0}}}link' .format (wordpress_namespace ),
@@ -453,62 +454,130 @@ def import_attachment(self, item, wordpress_namespace):
453
454
links [link ] = '/' + dst_url
454
455
links [url ] = '/' + dst_url
455
456
456
- result = {}
457
- result ['files' ] = [path ] + self .download_additional_image_sizes (
458
- item ,
459
- wordpress_namespace ,
460
- os .path .dirname (url )
461
- )
462
- return result
463
-
464
- def download_additional_image_sizes (self , item , wordpress_namespace , source_path ):
465
- if phpserialize is None :
466
- return []
457
+ files = [path ]
458
+ files_meta = [{}]
467
459
468
460
additional_metadata = item .findall ('{{{0}}}postmeta' .format (wordpress_namespace ))
469
- if additional_metadata is None :
470
- return []
471
-
472
- result = []
473
- for element in additional_metadata :
474
- meta_key = element .find ('{{{0}}}meta_key' .format (wordpress_namespace ))
475
- if meta_key is not None and meta_key .text == '_wp_attachment_metadata' :
476
- meta_value = element .find ('{{{0}}}meta_value' .format (wordpress_namespace ))
461
+ if phpserialize and additional_metadata :
462
+ for element in additional_metadata :
463
+ meta_key = element .find ('{{{0}}}meta_key' .format (wordpress_namespace ))
464
+ if meta_key is not None and meta_key .text == '_wp_attachment_metadata' :
465
+ meta_value = element .find ('{{{0}}}meta_value' .format (wordpress_namespace ))
466
+
467
+ if meta_value is None :
468
+ continue
469
+
470
+ # Someone from Wordpress thought it was a good idea
471
+ # to serialize PHP objects into that metadata field. Given
472
+ # that the export should give you the power to insert
473
+ # your blog into another site or system, it's not.
474
+ # Why don't they just use JSON?
475
+ if sys .version_info [0 ] == 2 :
476
+ try :
477
+ metadata = phpserialize .loads (utils .sys_encode (meta_value .text ))
478
+ except ValueError :
479
+ # local encoding might be wrong sometimes
480
+ metadata = phpserialize .loads (meta_value .text .encode ('utf-8' ))
481
+ else :
482
+ metadata = phpserialize .loads (meta_value .text .encode ('utf-8' ))
477
483
478
- if meta_value is None :
479
- continue
484
+ meta_key = b'image_meta'
485
+ size_key = b'sizes'
486
+ file_key = b'file'
487
+ width_key = b'width'
488
+ height_key = b'height'
489
+
490
+ # Extract metadata
491
+ if width_key in metadata and height_key in metadata :
492
+ files_meta [0 ]['width' ] = int (metadata [width_key ])
493
+ files_meta [0 ]['height' ] = int (metadata [height_key ])
494
+
495
+ if meta_key in metadata :
496
+ image_meta = metadata [meta_key ]
497
+ dst_meta = {}
498
+
499
+ def add (our_key , wp_key , is_int = False , ignore_zero = False ):
500
+ if wp_key in image_meta :
501
+ value = image_meta [wp_key ]
502
+ if is_int :
503
+ value = int (value )
504
+ if ignore_zero and value == 0 :
505
+ return
506
+ else :
507
+ value = value .decode ('utf-8' ) # assume UTF-8
508
+ if value == '' : # skip empty values
509
+ return
510
+ dst_meta [our_key ] = value
511
+
512
+ add ('aperture' , b'aperture' , is_int = True , ignore_zero = True )
513
+ add ('credit' , b'credit' )
514
+ add ('camera' , b'camera' )
515
+ add ('caption' , b'caption' )
516
+ add ('created_timestamp' , b'created_timestamp' , is_int = True , ignore_zero = True )
517
+ add ('copyright' , b'copyright' )
518
+ add ('focal_length' , b'focal_length' , is_int = True , ignore_zero = True )
519
+ add ('iso' , b'iso' , is_int = True , ignore_zero = True )
520
+ add ('shutter_speed' , b'shutter_speed' , is_int = True , ignore_zero = True )
521
+ add ('title' , b'title' )
522
+
523
+ if len (dst_meta ) > 0 :
524
+ files_meta [0 ]['meta' ] = dst_meta
525
+
526
+ # Find other sizes of image
527
+ if size_key not in metadata :
528
+ continue
529
+
530
+ for size in metadata [size_key ]:
531
+ filename = metadata [size_key ][size ][file_key ]
532
+ url = '/' .join ([source_path , filename .decode ('utf-8' )])
533
+
534
+ # Construct metadata
535
+ meta = {}
536
+ meta ['size' ] = size .decode ('utf-8' )
537
+ if width_key in metadata [size_key ][size ] and height_key in metadata [size_key ][size ]:
538
+ meta ['width' ] = metadata [size_key ][size ][width_key ]
539
+ meta ['height' ] = metadata [size_key ][size ][height_key ]
540
+
541
+ path = urlparse (url ).path
542
+ dst_path = os .path .join (* ([self .output_folder , 'files' ] + list (path .split ('/' ))))
543
+ dst_dir = os .path .dirname (dst_path )
544
+ utils .makedirs (dst_dir )
545
+ LOGGER .info ("Downloading {0} => {1}" .format (url , dst_path ))
546
+ self .download_url_content_to_file (url , dst_path )
547
+ dst_url = '/' .join (dst_path .split (os .sep )[2 :])
548
+ links [url ] = '/' + dst_url
549
+
550
+ files .append (path )
551
+ files_meta .append (meta )
552
+
553
+ # Prepare result
554
+ result = {}
555
+ result ['files' ] = files
556
+ result ['files_meta' ] = files_meta
480
557
481
- # Someone from Wordpress thought it was a good idea
482
- # serialize PHP objects into that metadata field. Given
483
- # that the export should give you the power to insert
484
- # your blogging into another site or system its not.
485
- # Why don't they just use JSON?
486
- if sys .version_info [0 ] == 2 :
487
- try :
488
- metadata = phpserialize .loads (utils .sys_encode (meta_value .text ))
489
- except ValueError :
490
- # local encoding might be wrong sometimes
491
- metadata = phpserialize .loads (meta_value .text .encode ('utf-8' ))
492
- else :
493
- metadata = phpserialize .loads (meta_value .text .encode ('utf-8' ))
494
- size_key = b'sizes'
495
- file_key = b'file'
558
+ # Prepare extraction of more information
559
+ dc_namespace = item .nsmap ['dc' ]
560
+ content_namespace = item .nsmap ['content' ]
561
+ excerpt_namespace = item .nsmap ['excerpt' ]
496
562
497
- if size_key not in metadata :
498
- continue
563
+ def add (result_key , key , namespace = None , filter = None , store_empty = False ):
564
+ if namespace is not None :
565
+ value = get_text_tag (item , '{{{0}}}{1}' .format (namespace , key ), None )
566
+ else :
567
+ value = get_text_tag (item , key , None )
568
+ if value is not None :
569
+ if filter :
570
+ value = filter (value )
571
+ if value or store_empty :
572
+ result [result_key ] = value
573
+
574
+ add ('title' , 'title' )
575
+ add ('date_utc' , 'post_date_gmt' , namespace = wordpress_namespace )
576
+ add ('wordpress_user_name' , 'creator' , namespace = dc_namespace )
577
+ add ('content' , 'encoded' , namespace = content_namespace )
578
+ add ('excerpt' , 'encoded' , namespace = excerpt_namespace )
579
+ add ('description' , 'description' )
499
580
500
- for filename in [metadata [size_key ][size ][file_key ] for size in metadata [size_key ]]:
501
- url = '/' .join ([source_path , filename .decode ('utf-8' )])
502
-
503
- path = urlparse (url ).path
504
- dst_path = os .path .join (* ([self .output_folder , 'files' ] + list (path .split ('/' ))))
505
- dst_dir = os .path .dirname (dst_path )
506
- utils .makedirs (dst_dir )
507
- LOGGER .info ("Downloading {0} => {1}" .format (url , dst_path ))
508
- self .download_url_content_to_file (url , dst_path )
509
- dst_url = '/' .join (dst_path .split (os .sep )[2 :])
510
- links [url ] = '/' + dst_url
511
- result .append (path )
512
581
return result
513
582
514
583
code_re1 = re .compile (r'\[code.* lang.*?="(.*?)?".*\](.*?)\[/code\]' , re .DOTALL | re .MULTILINE )
0 commit comments