from datetime import datetime
import time
import gzip
import xml.etree.ElementTree as ET
import os
import re
import urllib2

from django.core.files import File
from django.core.management.base import BaseCommand, CommandError
from django.utils.text import slugify

from ...models import Emission, Episode, Diffusion, Category, SoundFile, NewsItem


from _spip2html import makeHtmlFromSpip


class Rubric(object):
    archived = False

    def __init__(self):
        self.articles = {}
        self.rubrics = {}
        self.categories = []


class Article(object):
    pass


class Breve(object):
    pass


class KeywordGroup(object):
    def __init__(self):
        self.keywords = {}


class Keyword(object):
    related_object = None


class Document(object):
    pass


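# Rough usage sketch (assumptions: this module sits in a management/commands/
# directory, so the command name below is taken from the file name, and the
# code targets Python 2 with the old optparse-style Django command API):
#
#   ./manage.py load-from-spip spip_dump.xml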
class Command(BaseCommand):
    args = 'filename'
    help = 'Load emissions and episodes from a Spip dump file'

    def handle(self, filename, *args, **options):
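        # Overall flow: parse the (partially cleaned) XML dump, load keyword
        # groups, keywords, rubrics, breves, articles and documents, then map
        # them onto Emission, Episode, Diffusion, SoundFile and NewsItem
        # objects.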
        with open(filename) as fd:
            content = fd.read()
            # the spip_courriers parts of the spip export are not properly
            # encoded; we manually remove them here so the XML file can be
            # parsed correctly.
            content = content[:content.find('<spip_courriers>')] + \
                      content[content.rfind('</spip_courriers>')+17:]
            self.root = ET.fromstring(content)

            self.load_keyword_groups()
            self.load_keywords()
            self.load_rubrics()

            emission_rubric_ids = []
            straight_emission_rubric_ids = []
            for rubric in self.rubrics['2'].rubrics.values(): # 'Les emissions'
                emission_rubric_ids.append(rubric.id_rubrique)
                straight_emission_rubric_ids.append(rubric.id_rubrique)
                for subrubric in rubric.rubrics.values():
                    emission_rubric_ids.append(subrubric.id_rubrique)

            self.load_breves()
            self.load_articles(emission_rubric_ids)

            self.set_urls()

            self.load_documents()
            self.load_document_links()

            self.process_emission_keywords()
            self.process_episode_keywords()

            for emission_id in straight_emission_rubric_ids:
                rubric = self.rubrics[emission_id]
                emission = self.get_or_create_emission(rubric)

                for article in rubric.articles.values():
                    episode = self.get_or_create_episode(article, emission)
                    if episode is None:
                        continue

                    self.set_sound_files(article, episode)

            for breve in self.breves.values():
                newsitem = self.get_or_create_newsitem(breve)

    def load_keyword_groups(self):
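        # Only the keyword groups used by the import are kept: archives (11),
        # subjects (12), categories (3) and transversal keywords (10).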
        self.keyword_groups = {}
        for keywordgroup_xml in self.root.iter('spip_groupes_mots'):
            keyword_group = KeywordGroup()
            for attr in ('id_groupe', 'titre'):
                setattr(keyword_group, attr, keywordgroup_xml.find(attr).text)
            if keyword_group.id_groupe not in ('11', # archives
                                               '12', # subjects
                                               '3',  # category
                                               '10', # transversal
                                               ):
                continue
            self.keyword_groups[keyword_group.id_groupe] = keyword_group

    def load_keywords(self):
        self.keywords = {}
        for keyword_xml in self.root.iter('spip_mots'):
            keyword = Keyword()
            for attr in ('id_mot', 'titre', 'id_groupe'):
                setattr(keyword, attr, keyword_xml.find(attr).text)
            if keyword.id_groupe not in self.keyword_groups:
                continue
            if keyword.id_mot in ('92',): # blacklist
                continue
            self.keywords[keyword.id_mot] = keyword
            self.keyword_groups[keyword.id_groupe].keywords[keyword.id_mot] = keyword

    def load_rubrics(self):
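        # Load every rubric keyed by id_rubrique, then attach each rubric to
        # its parent so rubric.rubrics reflects the Spip hierarchy.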
        self.rubrics = {}
        for rubric_xml in self.root.iter('spip_rubriques'):
            rubric = Rubric()
            for attr in ('id_rubrique', 'id_parent', 'titre',
                         'descriptif', 'texte'):
                setattr(rubric, attr, rubric_xml.find(attr).text)
            self.rubrics[rubric.id_rubrique] = rubric

        for rubric in self.rubrics.values():
            if rubric.id_parent and rubric.id_parent != '0':
                self.rubrics[rubric.id_parent].rubrics[rubric.id_rubrique] = rubric

    def load_breves(self):
        self.breves = {}
        for breve_xml in self.root.iter('spip_breves'):
            breve = Breve()
            for attr in ('id_breve', 'titre', 'texte',
                         'date_heure', 'statut'):
                setattr(breve, attr, breve_xml.find(attr).text)
            if breve.statut != 'publie':
                continue
            self.breves[breve.id_breve] = breve

    def load_articles(self, emission_rubric_ids):
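        # Keep published articles belonging to emission rubrics; articles from
        # rubric 65 (events) are converted to Breve objects further down so
        # they end up as news items.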
        self.articles = {}
        for article_xml in self.root.iter('spip_articles'):
            if article_xml.find('id_rubrique').text == '65':
                pass # rubric for events, handle with care
            elif article_xml.find('id_rubrique').text not in emission_rubric_ids:
                continue
            article = Article()
            for attr in ('id_rubrique', 'id_article', 'titre', 'surtitre',
                         'soustitre', 'descriptif', 'chapo', 'texte',
                         'date_redac', 'statut', 'date'):
                setattr(article, attr, article_xml.find(attr).text)

            if article.id_rubrique == '65':
                # this is an event; events get special handling and are
                # merged with newsitems
                if article.statut not in ('publie', 'prop'):
                    continue
                breve = Breve()
                breve.id_breve = '0%s' % article.id_article
                breve.titre = article.titre
                breve.texte = article.texte
                breve.date_heure = article.date
                self.breves[breve.id_breve] = breve
                continue

            if article.statut != 'publie':
                continue
            article.mots_cles = []
            self.articles[article.id_article] = article

            if self.rubrics[article.id_rubrique].id_parent != '2':
                # the spip structure didn't really expect subrubrics in the
                # 'emissions' section, but people added some nevertheless;
                # move related articles to their parent rubric.
                article.id_rubrique = self.rubrics[article.id_rubrique].id_parent

            self.rubrics[article.id_rubrique].articles[article.id_article] = article

    def set_urls(self):
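        # spip_urls maps object type + id to the public URL slug; copy it onto
        # the matching article, breve, rubric or keyword.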
        for spip_url_xml in self.root.iter('spip_urls'):
            id_objet = spip_url_xml.find('id_objet').text
            url = spip_url_xml.find('url').text
            if spip_url_xml.find('type').text == 'article' and id_objet in self.articles:
                self.articles[id_objet].url = url
            elif spip_url_xml.find('type').text == 'article' and ('0%s' % id_objet) in self.breves:
                self.breves['0' + id_objet].url = url
            elif spip_url_xml.find('type').text == 'rubrique' and id_objet in self.rubrics:
                self.rubrics[id_objet].url = url
            elif spip_url_xml.find('type').text == 'mot' and id_objet in self.keywords:
                self.keywords[id_objet].url = url
            elif spip_url_xml.find('type').text == 'breve' and id_objet in self.breves:
                self.breves[id_objet].url = url

    def load_documents(self):
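        # Remote documents (distant == 'oui') are downloaded into media/IMG/
        # (if that directory exists) so they can be attached like local files.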
        self.documents = {}
        for spip_doc_xml in self.root.iter('spip_documents'):
            id_document = spip_doc_xml.find('id_document').text
            doc = Document()
            doc.filename = spip_doc_xml.find('fichier').text
            doc.title = spip_doc_xml.find('titre').text
            if spip_doc_xml.find('distant').text == 'oui':
                url = doc.filename
                doc.filename = os.path.split(url)[-1]
                filename = os.path.join('media/IMG/', doc.filename)
                if not os.path.exists('media/IMG'):
                    continue
                if not os.path.exists(filename):
                    fd = open(filename, 'wb')
                    fd.write(urllib2.urlopen(url).read())
                    fd.close()
            self.documents[id_document] = doc

    def load_document_links(self):
        self.attached_documents = {}
        for spip_doc_liens_xml in self.root.iter('spip_documents_liens'):
            id_document = spip_doc_liens_xml.find('id_document').text
            id_object = spip_doc_liens_xml.find('id_objet').text
            if spip_doc_liens_xml.find('objet').text != 'article':
                continue
            if not self.attached_documents.get(id_object):
                self.attached_documents[id_object] = []
            self.attached_documents[id_object].append(self.documents.get(id_document))

    def process_emission_keywords(self):
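        # Keyword 100 flags a rubric as archived; keywords from group 3
        # (categories) are recorded on the rubric and mapped to Category
        # objects.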
        for rubrickeyword_xml in self.root.iter('spip_mots_rubriques'):
            keyword_id = rubrickeyword_xml.find('id_mot').text
            rubric_id = rubrickeyword_xml.find('id_rubrique').text
            rubric = self.rubrics.get(rubric_id)
            if not rubric:
                continue

            if keyword_id == '100': # archive
                rubric.archived = True
                continue

            keyword = self.keywords.get(keyword_id)
            if keyword is None:
                continue

            if keyword.id_groupe == '3': # category
                rubric.categories.append(keyword)
                if not keyword.related_object:
                    cs = Category.objects.filter(title=keyword.titre)
                    if len(cs):
                        c = cs[0]
                    else:
                        c = Category()
                    c.title = keyword.titre
                    c.save()
                    keyword.related_object = c

    def process_episode_keywords(self):
        for articlekeyword_xml in self.root.iter('spip_mots_articles'):
            keyword_id = articlekeyword_xml.find('id_mot').text
            article_id = articlekeyword_xml.find('id_article').text
            article = self.articles.get(article_id)
            if not article:
                continue

            keyword = self.keywords.get(keyword_id)
            if keyword is None:
                continue

            if keyword.id_groupe in ('10', '12'): # transversales & sujets
                article.mots_cles.append(keyword.titre)

    def get_or_create_emission(self, rubric):
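        # The emission slug is derived from the Spip rubric URL; fall back to
        # the part before the first comma when no exact match exists.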
        slug = rubric.url.lower()
        try:
            emission = Emission.objects.get(slug=slug)
        except Emission.DoesNotExist:
            slug = slug.split(',')[0]
            try:
                emission = Emission.objects.get(slug=slug)
            except Emission.DoesNotExist:
                emission = Emission()
        slug = slug.split(',')[0]
        emission.slug = slug
        emission.title = rubric.titre
        emission.archived = rubric.archived
        emission.description = makeHtmlFromSpip(rubric.descriptif,
                        documents=self.documents) or None
        emission.text = makeHtmlFromSpip(rubric.texte,
                        documents=self.documents) or None

        image_path = None
        for ext in ('.jpg', '.png', '.gif'):
            if os.path.exists('media/IMG/rubon%s%s' % (rubric.id_rubrique, ext)):
                image_path = ['media/IMG/rubon%s%s' % (rubric.id_rubrique, ext)]
                break
        else:
            if emission.text:
                image_path = re.findall('src="/(media/IMG.*?)"', emission.text, re.DOTALL)
            elif emission.description:
                image_path = re.findall('src="/(media/IMG.*?)"', emission.description, re.DOTALL)

        self.set_image(emission, image_path)

        emission.save()
        emission.categories.clear()
        for category in rubric.categories:
            emission.categories.add(category.related_object)
        emission.save()
        return emission

    def get_or_create_episode(self, article, emission):
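        # Create or update the Episode matching this article and, when none
        # exists yet, record a Diffusion based on the article's date_redac.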
        if article.date_redac == '0000-00-00 00:00:00':
            # date_redac was used for the diffusion date; if it's not set,
            # it's probably not really an episode
            return None

        slug = article.url.lower()
        if slug.startswith('nouvel-article'):
            # <sigh/>
            slug = slugify(unicode(article.titre))
        try:
            episode = Episode.objects.get(slug=slug)
        except Episode.DoesNotExist:
            episode = Episode()
            episode.slug = slug
        episode.emission = emission
        episode.title = article.titre
        episode.description = makeHtmlFromSpip(article.descriptif,
                        documents=self.documents) or None
        episode.text = makeHtmlFromSpip(article.texte,
                        documents=self.documents) or None

        image_path = None
        for ext in ('.jpg', '.png', '.gif'):
            if os.path.exists('media/IMG/arton%s%s' % (article.id_article, ext)):
                image_path = ['media/IMG/arton%s%s' % (article.id_article, ext)]
                break
        else:
            if episode.text:
                image_path = re.findall('src="/(media/IMG.*?)"', episode.text, re.DOTALL)
            elif episode.description:
                image_path = re.findall('src="/(media/IMG.*?)"', episode.description, re.DOTALL)

        self.set_image(episode, image_path)

        for motcle in article.mots_cles:
            episode.tags.add(motcle.lower())

        episode.save()

        if not Diffusion.objects.filter(episode=episode).count():
            diffusion = Diffusion()
            diffusion.episode = episode
            try:
                diffusion.datetime = datetime.strptime(article.date_redac, '%Y-%m-%d %H:%M:%S')
            except ValueError:
                pass
            else:
                diffusion.save()

        return episode

    def set_sound_files(self, article, episode):
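        # Attach .ogg/.mp3 documents found in media/IMG/ as SoundFile objects,
        # unless the episode already has sound files.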
        if SoundFile.objects.filter(episode=episode).count():
            return # skip episodes that already have sound files
        episode_files = self.attached_documents.get(article.id_article)
        if episode_files:
            for episode_file in episode_files:
                if episode_file is None:
                    continue
                if os.path.splitext(episode_file.filename)[-1] not in ('.ogg', '.mp3'):
                    continue
                if not os.path.exists('media/IMG/' + episode_file.filename):
                    continue
                soundfile = SoundFile()
                soundfile.episode = episode
                soundfile.podcastable = True
                soundfile.fragment = False
                soundfile.title = episode_file.title or '[pas de titre]'
                soundfile.file = File(open('media/IMG/' + episode_file.filename, 'rb'))
                soundfile.save()

    def get_or_create_newsitem(self, breve):
        slug = breve.url.lower()
        try:
            newsitem = NewsItem.objects.get(slug=slug)
        except NewsItem.DoesNotExist:
            newsitem = NewsItem()
        newsitem.title = breve.titre
        newsitem.slug = slug
        newsitem.text = makeHtmlFromSpip(breve.texte,
                documents=self.documents) or None
        newsitem.datetime = datetime.strptime(breve.date_heure, '%Y-%m-%d %H:%M:%S')
        for ext in ('.jpg', '.png', '.gif'):
            if os.path.exists('media/IMG/breveon%s%s' % (breve.id_breve, ext)):
                image_path = ['media/IMG/breveon%s%s' % (breve.id_breve, ext)]
                break
        else:
            image_path = re.findall('src="/(media/IMG.*?)"', newsitem.text, re.DOTALL)
        self.set_image(newsitem, image_path)
        newsitem.save()

    def set_image(self, object, image_path):
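        # Assign the first matching image to the object, skipping it when the
        # same file is already set.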
        if not image_path:
            return
        image_path = image_path[0]
        if object.image and os.path.basename(object.image.path) == os.path.basename(image_path):
            return
        object.image = File(open(image_path, 'rb'))