Updating batchaddmedia to use new metadata tools
This commit is contained in:
parent
e5e2cc2f16
commit
6fab7734d6
@ -15,7 +15,7 @@
|
|||||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import copy, tempfile, tarfile, zipfile, subprocess, re, requests
|
import tempfile, tarfile, zipfile, subprocess, requests
|
||||||
from csv import reader as csv_reader
|
from csv import reader as csv_reader
|
||||||
from urlparse import urlparse
|
from urlparse import urlparse
|
||||||
from pyld import jsonld
|
from pyld import jsonld
|
||||||
@ -24,11 +24,9 @@ from mediagoblin.gmg_commands import util as commands_util
|
|||||||
from mediagoblin.submit.lib import (
|
from mediagoblin.submit.lib import (
|
||||||
submit_media, get_upload_file_limits,
|
submit_media, get_upload_file_limits,
|
||||||
FileUploadLimit, UserUploadLimit, UserPastUploadLimit)
|
FileUploadLimit, UserUploadLimit, UserPastUploadLimit)
|
||||||
from mediagoblin.tools.translate import lazy_pass_to_ugettext as _
|
from mediagoblin.tools.metadata import compact_and_validate
|
||||||
|
|
||||||
from jsonschema import validate, FormatChecker, draft4_format_checker
|
|
||||||
from jsonschema.exceptions import ValidationError
|
from jsonschema.exceptions import ValidationError
|
||||||
from jsonschema.compat import str_types
|
|
||||||
|
|
||||||
|
|
||||||
def parser_setup(subparser):
|
def parser_setup(subparser):
|
||||||
@ -126,25 +124,24 @@ zip files and directories"
|
|||||||
contents = all_metadata.read()
|
contents = all_metadata.read()
|
||||||
media_metadata = parse_csv_file(contents)
|
media_metadata = parse_csv_file(contents)
|
||||||
|
|
||||||
metadata_context = { 'dcterms':'http://purl.org/dc/terms/',
|
|
||||||
'xsd': 'http://www.w3.org/2001/XMLSchema#'}
|
|
||||||
|
|
||||||
for media_id in media_locations.keys():
|
for media_id in media_locations.keys():
|
||||||
files_attempted += 1
|
files_attempted += 1
|
||||||
|
|
||||||
file_metadata = media_metadata[media_id]
|
file_metadata = media_metadata[media_id]
|
||||||
sanitized_metadata = check_metadata_format(file_metadata)
|
try:
|
||||||
if sanitized_metadata == {}: continue
|
json_ld_metadata = compact_and_validate(file_metadata)
|
||||||
|
except ValidationError, exc:
|
||||||
|
print "Error with '%s' value '%s': %s" % (
|
||||||
|
media_id, exc.path[0], exc.message)
|
||||||
|
continue
|
||||||
|
|
||||||
json_ld_metadata = jsonld.compact(file_metadata, metadata_context)
|
|
||||||
original_location = media_locations[media_id]['media:original']
|
original_location = media_locations[media_id]['media:original']
|
||||||
url = urlparse(original_location)
|
url = urlparse(original_location)
|
||||||
|
|
||||||
title = sanitized_metadata.get('dcterms:title')
|
title = json_ld_metadata.get('dcterms:title')
|
||||||
description = sanitized_metadata.get('dcterms:description')
|
description = json_ld_metadata.get('dcterms:description')
|
||||||
|
|
||||||
# TODO: this isn't the same thing
|
license = json_ld_metadata.get('license')
|
||||||
license = sanitized_metadata.get('dcterms:rights')
|
|
||||||
filename = url.path.split()[-1]
|
filename = url.path.split()[-1]
|
||||||
|
|
||||||
if url.scheme == 'http':
|
if url.scheme == 'http':
|
||||||
@ -214,75 +211,3 @@ def parse_csv_file(file_contents):
|
|||||||
def teardown(temp_files):
|
def teardown(temp_files):
|
||||||
for temp_file in temp_files:
|
for temp_file in temp_files:
|
||||||
subprocess.call(['rm','-r',temp_file])
|
subprocess.call(['rm','-r',temp_file])
|
||||||
|
|
||||||
|
|
||||||
## Set up the MediaGoblin checker
|
|
||||||
#
|
|
||||||
|
|
||||||
URL_REGEX = re.compile(
|
|
||||||
r'^[a-z]+://([^/:]+|([0-9]{1,3}\.){3}[0-9]{1,3})(:[0-9]+)?(\/.*)?$',
|
|
||||||
re.IGNORECASE)
|
|
||||||
|
|
||||||
def is_uri(instance):
|
|
||||||
if not isinstance(instance, str_types):
|
|
||||||
return True
|
|
||||||
|
|
||||||
return URL_REGEX.match(instance)
|
|
||||||
|
|
||||||
|
|
||||||
class DefaultChecker(FormatChecker):
|
|
||||||
checkers = copy.deepcopy(draft4_format_checker.checkers)
|
|
||||||
|
|
||||||
DefaultChecker.checkers[u"uri"] = (is_uri, ())
|
|
||||||
|
|
||||||
DEFAULT_CHECKER = DefaultChecker()
|
|
||||||
|
|
||||||
def check_metadata_format(metadata_dict):
|
|
||||||
schema = {
|
|
||||||
"$schema": "http://json-schema.org/schema#",
|
|
||||||
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"dcterms:rights": {
|
|
||||||
"format": "uri",
|
|
||||||
"type": "string",
|
|
||||||
},
|
|
||||||
"dcterms:created": {
|
|
||||||
|
|
||||||
}
|
|
||||||
},
|
|
||||||
# "required": ["dcterms:title", "media:id"],
|
|
||||||
}
|
|
||||||
|
|
||||||
try:
|
|
||||||
validate(metadata_dict, schema,
|
|
||||||
format_checker=DEFAULT_CHECKER)
|
|
||||||
output_dict = metadata_dict
|
|
||||||
# "media:id" is only for internal use, so we delete it for the output
|
|
||||||
del output_dict['media:id']
|
|
||||||
|
|
||||||
except ValidationError, exc:
|
|
||||||
title = (metadata_dict.get('dcterms:title') or
|
|
||||||
metadata_dict.get('media:id') or _(u'UNKNOWN FILE'))
|
|
||||||
|
|
||||||
if exc.validator == "additionalProperties":
|
|
||||||
message = _(u'Invalid metadata provided for file "{title}". This \
|
|
||||||
script only accepts the Dublin Core metadata terms.'.format(title=title))
|
|
||||||
|
|
||||||
elif exc.validator == "required":
|
|
||||||
message = _(
|
|
||||||
u'All necessary metadata was not provided for file "{title}", you must include \
|
|
||||||
a "dcterms:title" column for each media file'.format(title=title))
|
|
||||||
|
|
||||||
else:
|
|
||||||
message = _(u'Could not find appropriate metadata for file \
|
|
||||||
"{title}".'.format(title=title))
|
|
||||||
|
|
||||||
print _(u"""WARN: {message} \nSkipping File...\n""".format(
|
|
||||||
message=message))
|
|
||||||
|
|
||||||
output_dict = {}
|
|
||||||
except:
|
|
||||||
raise
|
|
||||||
|
|
||||||
return output_dict
|
|
||||||
|
@ -78,7 +78,7 @@ DEFAULT_SCHEMA = {
|
|||||||
|
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
"dcterms:rights": {
|
"license": {
|
||||||
"format": "uri",
|
"format": "uri",
|
||||||
"type": "string",
|
"type": "string",
|
||||||
},
|
},
|
||||||
@ -96,7 +96,7 @@ def compact_and_validate(metadata, context=MEDIAGOBLIN_CONTEXT,
|
|||||||
compact json with supplied context, check against schema for errors
|
compact json with supplied context, check against schema for errors
|
||||||
|
|
||||||
raises an exception (jsonschema.exceptions.ValidationError) if
|
raises an exception (jsonschema.exceptions.ValidationError) if
|
||||||
there's an error.
|
there's an error.
|
||||||
|
|
||||||
You may wish to do this validation yourself... this is just for convenience.
|
You may wish to do this validation yourself... this is just for convenience.
|
||||||
"""
|
"""
|
||||||
|
Loading…
x
Reference in New Issue
Block a user