Commit d79604db authored by fred's avatar fred
Browse files

refactored load-from-spip code into smaller methods

parent a650993b
......@@ -51,13 +51,6 @@ class Command(BaseCommand):
help = 'Load emissions and episodes from a Spip dump file'
def handle(self, filename, *args, **options):
    """Entry point: load emissions and episodes from a Spip dump file.

    Parses the XML dump, loads the raw Spip objects (rubrics, articles,
    keywords, breves, documents) into dicts on ``self``, then creates or
    updates the corresponding Django objects (Emission, Episode, ...).
    """
    with open(filename) as fd:
        content = fd.read()
    # the spip_courriers parts of the spip export are not properly
    # parsed correctly; cut that section out before feeding ElementTree.
    content = content[:content.find('<spip_courriers>')] + \
            content[content.rfind('</spip_courriers>')+17:]
    self.root = ET.fromstring(content)

    self.load_keyword_groups()
    self.load_keywords()
    self.load_rubrics()

    # rubric '2' is 'Les emissions'; collect its children (one per
    # emission) and, for article filtering, their subrubrics too.
    emission_rubric_ids = []
    straight_emission_rubric_ids = []
    for rubric in self.rubrics['2'].rubrics.values():  # 'Les emissions'
        emission_rubric_ids.append(rubric.id_rubrique)
        straight_emission_rubric_ids.append(rubric.id_rubrique)
        for subrubric in rubric.rubrics.values():
            emission_rubric_ids.append(subrubric.id_rubrique)

    self.load_breves()
    self.load_articles(emission_rubric_ids)
    self.set_urls()
    self.load_documents()
    self.load_document_links()
    self.process_emission_keywords()
    self.process_episode_keywords()

    # create/update Django objects from the loaded Spip data
    for emission_id in straight_emission_rubric_ids:
        rubric = self.rubrics[emission_id]
        emission = self.get_or_create_emission(rubric)
        for article in rubric.articles.values():
            episode = self.get_or_create_episode(article, emission)
            if episode is None:
                continue
            self.set_sound_files(article, episode)
    for breve in self.breves.values():
        newsitem = self.get_or_create_newsitem(breve)
def load_keyword_groups(self):
    """Collect the Spip keyword groups this import cares about."""
    # only these groups are meaningful here:
    # 11: archives, 12: subjects, 3: category, 10: transversal
    wanted_groups = ('11', '12', '3', '10')
    self.keyword_groups = {}
    for xml_node in self.root.iter('spip_groupes_mots'):
        group = KeywordGroup()
        group.id_groupe = xml_node.find('id_groupe').text
        group.titre = xml_node.find('titre').text
        if group.id_groupe in wanted_groups:
            self.keyword_groups[group.id_groupe] = group
def load_keywords(self):
    """Collect keywords that belong to one of the loaded keyword groups."""
    self.keywords = {}
    blacklist = ('92',)
    for xml_node in self.root.iter('spip_mots'):
        kw = Keyword()
        for attr in ('id_mot', 'titre', 'id_groupe'):
            setattr(kw, attr, xml_node.find(attr).text)
        if kw.id_groupe not in self.keyword_groups:
            continue
        if kw.id_mot in blacklist:
            continue
        self.keywords[kw.id_mot] = kw
        # NOTE(review): this replaces the KeywordGroup entry with the last
        # Keyword seen for that group — looks suspicious, confirm intent.
        self.keyword_groups[kw.id_groupe] = kw
def load_rubrics(self):
    """Load all rubrics and wire up the parent/child hierarchy."""
    self.rubrics = {}
    for xml_node in self.root.iter('spip_rubriques'):
        rubric = Rubric()
        for attr in ('id_rubrique', 'id_parent', 'titre',
                     'descriptif', 'texte'):
            setattr(rubric, attr, xml_node.find(attr).text)
        self.rubrics[rubric.id_rubrique] = rubric
    # second pass: attach each rubric to its parent ('0' means top-level)
    for rubric in self.rubrics.values():
        if rubric.id_parent and rubric.id_parent != '0':
            parent = self.rubrics[rubric.id_parent]
            parent.rubrics[rubric.id_rubrique] = rubric
def load_breves(self):
    """Load the published news items (breves) from the dump."""
    self.breves = {}
    for xml_node in self.root.iter('spip_breves'):
        breve = Breve()
        for attr in ('id_breve', 'titre', 'texte', 'date_heure', 'statut'):
            setattr(breve, attr, xml_node.find(attr).text)
        # only published items are kept
        if breve.statut == 'publie':
            self.breves[breve.id_breve] = breve
def load_articles(self, emission_rubric_ids):
    """Load published articles living in the emission rubrics.

    Articles in rubric '65' (events) are converted into pseudo-breves
    (id prefixed with '0') instead, to be merged with newsitems later.
    Articles found in a subrubric are reattached to the parent emission
    rubric.  Requires load_rubrics() and load_breves() to have run.
    """
    # NOTE: the diff this file came from had two stray lines here
    # referencing undefined names (keywords/keyword_id); they belonged to
    # the old keyword-processing loop and are dropped.
    self.articles = {}
    for article_xml in self.root.iter('spip_articles'):
        if article_xml.find('id_rubrique').text == '65':
            pass  # rubric for events, handle with care
        elif not article_xml.find('id_rubrique').text in emission_rubric_ids:
            continue
        article = Article()
        for attr in ('id_rubrique', 'id_article', 'titre', 'surtitre',
                     'soustitre', 'descriptif', 'chapo', 'texte',
                     'date_redac', 'statut', 'date'):
            setattr(article, attr, article_xml.find(attr).text)
        if article.id_rubrique == '65':
            # this is an event, they get a special handling, to be
            # merged with newsitems
            if article.statut not in ('publie', 'prop'):
                continue
            breve = Breve()
            breve.id_breve = '0%s' % article.id_article
            breve.titre = article.titre
            breve.texte = article.texte
            breve.date_heure = article.date
            self.breves[breve.id_breve] = breve
            continue
        if article.statut != 'publie':
            continue
        article.mots_cles = []
        self.articles[article.id_article] = article
        if self.rubrics[article.id_rubrique].id_parent != '2':
            # the spip structure didn't really expect subrubrics in the
            # 'emissions' section, but people added some nevertheless,
            # move related articles to their parent rubric.
            article.id_rubrique = self.rubrics[article.id_rubrique].id_parent
        self.rubrics[article.id_rubrique].articles[article.id_article] = article
def set_urls(self):
    """Assign the Spip 'nice' URLs to the loaded objects by type and id."""
    for url_xml in self.root.iter('spip_urls'):
        obj_id = url_xml.find('id_objet').text
        url = url_xml.find('url').text
        obj_type = url_xml.find('type').text
        if obj_type == 'article':
            if obj_id in self.articles:
                self.articles[obj_id].url = url
            elif '0%s' % obj_id in self.breves:
                # events were imported as pseudo-breves, id prefixed with '0'
                self.breves['0' + obj_id].url = url
        elif obj_type == 'rubrique' and obj_id in self.rubrics:
            self.rubrics[obj_id].url = url
        elif obj_type == 'mot' and obj_id in self.keywords:
            self.keywords[obj_id].url = url
        elif obj_type == 'breve' and obj_id in self.breves:
            self.breves[obj_id].url = url
def load_documents(self):
    """Load Spip documents; mirror remote ('distant') files locally.

    Remote files are downloaded into media/IMG/ unless already present;
    if that directory does not exist the document is skipped entirely.
    (Reconstructed from the surrounding diff: stray lines from the old
    keyword-processing loops were interleaved here and are dropped.)
    """
    self.documents = {}
    for spip_doc_xml in self.root.iter('spip_documents'):
        id_document = spip_doc_xml.find('id_document').text
        doc = Document()
        doc.filename = spip_doc_xml.find('fichier').text
        doc.title = spip_doc_xml.find('titre').text
        if spip_doc_xml.find('distant').text == 'oui':
            # remote document: 'fichier' holds a URL, keep its basename
            url = doc.filename
            doc.filename = os.path.split(url)[-1]
            filename = os.path.join('media/IMG/', doc.filename)
            if not os.path.exists('media/IMG'):
                continue
            if not os.path.exists(filename):
                # was: fd = file(filename, 'w') — py2-only builtin, and the
                # handle leaked on error; open() + with is equivalent here.
                with open(filename, 'w') as fd:
                    fd.write(urllib2.urlopen(url).read())
        self.documents[id_document] = doc
def load_document_links(self):
    """Map article ids to the list of documents attached to them."""
    self.attached_documents = {}
    for link_xml in self.root.iter('spip_documents_liens'):
        doc_id = link_xml.find('id_document').text
        obj_id = link_xml.find('id_objet').text
        # only article attachments are of interest
        if link_xml.find('objet').text != 'article':
            continue
        self.attached_documents.setdefault(obj_id, []).append(
                self.documents.get(doc_id))
def process_emission_keywords(self):
    """Apply rubric/keyword relations: the archive flag and categories."""
    for rel_xml in self.root.iter('spip_mots_rubriques'):
        keyword_id = rel_xml.find('id_mot').text
        rubric = self.rubrics.get(rel_xml.find('id_rubrique').text)
        if not rubric:
            continue
        if keyword_id == '100':  # the 'archive' keyword
            rubric.archived = True
            continue
        keyword = self.keywords.get(keyword_id)
        if keyword is None:
            continue
        if keyword.id_groupe != '3':  # only 'category' keywords matter here
            continue
        rubric.categories.append(keyword)
        if keyword.related_object:
            continue
        # lazily create the matching Category object, reusing one with the
        # same title when it already exists
        existing = Category.objects.filter(title=keyword.titre)
        if len(existing):
            keyword.related_object = existing[0]
        else:
            category = Category()
            category.title = keyword.titre
            category.save()
            keyword.related_object = category
def process_episode_keywords(self):
    """Tag loaded articles with their 'transversal' and 'subject' keywords."""
    for rel_xml in self.root.iter('spip_mots_articles'):
        article = self.articles.get(rel_xml.find('id_article').text)
        if not article:
            continue
        keyword = self.keywords.get(rel_xml.find('id_mot').text)
        if keyword is None:
            continue
        if keyword.id_groupe in ('10', '12'):  # transversales & sujets
            article.mots_cles.append(keyword.titre)
def get_or_create_emission(self, rubric):
"""Return the Emission matching a Spip rubric, creating it if needed.

Looks the emission up by slug (the lowercased rubric URL); on a miss,
retries with the part before the first comma, then creates a new one.
Title, texts, image and categories are reapplied on every call, so this
also acts as an updater for existing emissions.
"""
slug = rubric.url.lower()
try:
emission = Emission.objects.get(slug=slug)
except Emission.DoesNotExist:
# spip urls may carry a ',<suffix>' part; retry without it
slug = slug.split(',')[0]
try:
emission = Emission.objects.get(slug=slug)
except Emission.DoesNotExist:
emission = Emission()
# no-op here: slug was already truncated at the first comma above
slug = slug.split(',')[0]
emission.slug = slug
emission.title = rubric.titre
emission.archived = rubric.archived
emission.description = makeHtmlFromSpip(rubric.descriptif,
documents=self.documents) or None
emission.text = makeHtmlFromSpip(rubric.texte,
documents=self.documents) or None
# prefer an explicit rubric logo file (media/IMG/rubon<id>.<ext>) ...
image_path = None
for ext in ('.jpg', '.png', '.gif'):
if os.path.exists('media/IMG/rubon%s%s' % (rubric.id_rubrique, ext)):
image_path = ['media/IMG/rubon%s%s' % (rubric.id_rubrique, ext)]
break
else:
# ... otherwise fall back to images referenced in the generated HTML
if emission.text:
image_path = re.findall('src="/(media/IMG.*?)"', emission.text, re.DOTALL)
elif emission.description:
image_path = re.findall('src="/(media/IMG.*?)"', emission.description, re.DOTALL)
self.set_image(emission, image_path)
# first save before touching the categories relation, then rebuild it
emission.save()
emission.categories.clear()
for category in rubric.categories:
emission.categories.add(category.related_object)
emission.save()
return emission
def get_or_create_episode(self, article, emission):
"""Return the Episode for a Spip article, or None if it is not one.

Articles without a diffusion date (date_redac) are skipped.  The slug
comes from the article URL, falling back to a slugified title for
auto-named ('nouvel-article...') ones.  A Diffusion is created when the
episode has none yet and date_redac parses.
"""
if article.date_redac == '0000-00-00 00:00:00':
# date_redac was used for the diffusion date, if it's
# not set it's probably not really an episode
return None
slug = article.url.lower()
if slug.startswith('nouvel-article'):
# <sigh/>
slug = slugify(unicode(article.titre))
try:
episode = Episode.objects.get(slug=slug)
except Episode.DoesNotExist:
episode = Episode()
episode.slug = slug
episode.emission = emission
episode.title = article.titre
episode.description = makeHtmlFromSpip(article.descriptif,
documents=self.documents) or None
episode.text = makeHtmlFromSpip(article.texte,
documents=self.documents) or None
# prefer an explicit article logo file (media/IMG/arton<id>.<ext>) ...
image_path = None
for ext in ('.jpg', '.png', '.gif'):
if os.path.exists('media/IMG/arton%s%s' % (article.id_article, ext)):
image_path = ['media/IMG/arton%s%s' % (article.id_article, ext)]
break
else:
# ... otherwise fall back to images referenced in the generated HTML
if episode.text:
image_path = re.findall('src="/(media/IMG.*?)"', episode.text, re.DOTALL)
elif episode.description:
image_path = re.findall('src="/(media/IMG.*?)"', episode.description, re.DOTALL)
self.set_image(episode, image_path)
for motcle in article.mots_cles:
episode.tags.add(motcle.lower())
episode.save()
if not Diffusion.objects.filter(episode=episode).count():
diffusion = Diffusion()
diffusion.episode = episode
try:
diffusion.datetime = datetime.strptime(article.date_redac, '%Y-%m-%d %H:%M:%S')
except ValueError:
# unparseable diffusion date: silently skip creating the Diffusion
pass
else:
diffusion.save()
return episode
def set_sound_files(self, article, episode):
if SoundFile.objects.filter(episode=episode).count():
return # skip episodes that already have sound files
episode_files = self.attached_documents.get(article.id_article)
if episode_files:
for episode_file in episode_files:
if episode_file is None:
continue
keyword = keywords.get(keyword_id)
if keyword is None:
if os.path.splitext(episode_file.filename)[-1] not in ('.ogg', '.mp3'):
continue
if keyword.id_groupe in ('10', '12'): # transversales & sujets
article.mots_cles.append(keyword.titre)
for emission_id in straight_emission_rubric_ids:
rubric = rubrics[emission_id]
slug = rubric.url.lower()
try:
emission = Emission.objects.get(slug=slug)
except Emission.DoesNotExist:
slug = slug.split(',')[0]
try:
emission = Emission.objects.get(slug=slug)
except Emission.DoesNotExist:
emission = Emission()
slug = slug.split(',')[0]
emission.slug = slug
emission.title = rubric.titre
emission.archived = rubric.archived
emission.description = makeHtmlFromSpip(rubric.descriptif,
documents=documents) or None
emission.text = makeHtmlFromSpip(rubric.texte,
documents=documents) or None
image_path = None
for ext in ('.jpg', '.png', '.gif'):
if os.path.exists('media/IMG/rubon%s%s' % (rubric.id_rubrique, ext)):
image_path = ['media/IMG/rubon%s%s' % (rubric.id_rubrique, ext)]
break
else:
if emission.text:
image_path = re.findall('src="/(media/IMG.*?)"', emission.text, re.DOTALL)
elif emission.description:
image_path = re.findall('src="/(media/IMG.*?)"', emission.description, re.DOTALL)
self.set_image(emission, image_path)
emission.save()
emission.categories.clear()
for category in rubric.categories:
emission.categories.add(category.related_object)
emission.save()
for article in rubric.articles.values():
if article.date_redac == '0000-00-00 00:00:00':
# date_redac was used for the diffusion date, if it's
# not set it's probably not really an episode
continue
slug = article.url.lower()
if slug.startswith('nouvel-article'):
# <sigh/>
slug = slugify(article.title)
try:
episode = Episode.objects.get(slug=slug)
except Episode.DoesNotExist:
episode = Episode()
episode.slug = slug
episode.emission = emission
episode.title = article.titre
episode.description = makeHtmlFromSpip(article.descriptif,
documents=documents) or None
episode.text = makeHtmlFromSpip(article.texte,
documents=documents) or None
image_path = None
for ext in ('.jpg', '.png', '.gif'):
if os.path.exists('media/IMG/arton%s%s' % (article.id_article, ext)):
image_path = ['media/IMG/arton%s%s' % (article.id_article, ext)]
break
else:
if episode.text:
image_path = re.findall('src="/(media/IMG.*?)"', episode.text, re.DOTALL)
elif episode.description:
image_path = re.findall('src="/(media/IMG.*?)"', episode.description, re.DOTALL)
self.set_image(episode, image_path)
for motcle in article.mots_cles:
episode.tags.add(motcle.lower())
episode.save()
if not Diffusion.objects.filter(episode=episode).count():
diffusion = Diffusion()
diffusion.episode = episode
try:
diffusion.datetime = datetime.strptime(article.date_redac, '%Y-%m-%d %H:%M:%S')
except ValueError:
pass