load-from-spip.py 20.7 KB
Newer Older
1
2
# -*- coding: utf-8 -*-

3
from datetime import datetime
4
import time
5
6
import gzip
import xml.etree.ElementTree as ET
7
import os
fred's avatar
fred committed
8
import re
9
import urllib2
10

fred's avatar
fred committed
11
12
from optparse import make_option

13
from django.conf import settings
14
from django.core.files import File
15
from django.core.management.base import BaseCommand, CommandError
16
from django.core.urlresolvers import reverse
fred's avatar
fred committed
17
from django.utils.text import slugify
18

fred's avatar
fred committed
19
from ...models import Emission, Episode, Diffusion, Category, SoundFile, NewsItem
20
from ...models import get_sound_path
21
22


23
24
25
from _spip2html import makeHtmlFromSpip


26
class Rubric(object):
    """In-memory copy of a SPIP "rubrique" (a section of the site).

    Direct children of the 'Les emissions' rubric are turned into Emission
    objects by the import command.
    """

    # tokens used to rebuild this object type's legacy SPIP URLs
    spip_url_object_name = 'rubrique'
    spip_url_marker = '-'

    # flipped to True when the rubric carries the "archive" keyword
    archived = False

    def __init__(self):
        self.rubrics = {}      # child rubrics, keyed by SPIP id (string)
        self.articles = {}     # attached articles, keyed by SPIP id (string)
        self.categories = []   # Keyword objects from the "category" group

    @property
    def id(self):
        """Uniform id accessor so Rubric/Article/Breve can be handled alike."""
        return self.id_rubrique

41
42

class Article(object):
    """In-memory copy of a SPIP article (a candidate Episode)."""

    # tokens used to rebuild this object type's legacy SPIP URLs
    spip_url_object_name = 'article'
    spip_url_marker = ''

    @property
    def id(self):
        """Uniform id accessor mirroring Rubric.id and Breve.id."""
        return self.id_article
49
50


fred's avatar
fred committed
51
class Breve(object):
    """In-memory copy of a SPIP news item ("breve")."""

    # tokens used to rebuild this object type's legacy SPIP URLs
    spip_url_object_name = 'breve'
    spip_url_marker = '+'

    @property
    def id(self):
        """Uniform id accessor mirroring Rubric.id and Article.id."""
        return self.id_breve
fred's avatar
fred committed
58
59


60
61
62
63
64
65
66
67
68
class KeywordGroup(object):
    """A SPIP keyword group; collects its Keyword objects by id."""

    def __init__(self):
        # Keyword objects indexed by their SPIP id (string)
        self.keywords = {}


class Keyword(object):
    """A SPIP keyword ("mot").

    `related_object` caches the Django object materialized for this keyword
    (e.g. a Category), when one has been created.
    """

    related_object = None


69
70
71
72
class Document(object):
    """Plain attribute bag for a SPIP document; the loader sets
    `filename` and `title` on instances."""


73
74
75
76
class Command(BaseCommand):
    """Import emissions, episodes and news items from a SPIP XML dump."""

    args = 'filename'
    help = 'Load emissions and episodes from a Spip dump file'

    option_list = BaseCommand.option_list + (
            make_option('--no-updates',
                action='store_true',
                dest='dont_update',
                default=False,
                help='Only create new objects, do not update existing ones'),
            # typo fixed: 'ith' -> 'with'
            make_option('--stats-skipped',
                dest='stats_skipped_file',
                metavar='FILE',
                default=None,
                help='Create a CSV file with skipped articles'),
            # typo fixed: 'a Apache' -> 'an Apache'
            make_option('--rewritemap',
                dest='rewritemap_file',
                metavar='FILE',
                default=None,
                help='Create an Apache RewriteMap'),
    )

95
    def handle(self, filename, dont_update, rewritemap_file, stats_skipped_file, **options):
        """Entry point: import emissions/episodes/newsitems from a SPIP dump.

        filename -- path to the SPIP XML export file
        dont_update -- when set, existing objects are left untouched
        rewritemap_file -- optional output path for an Apache RewriteMap
        stats_skipped_file -- optional output path listing skipped articles
        """
        self.do_updates = (not dont_update)
        if rewritemap_file:
            self.rewritemap = []
        if stats_skipped_file:
            # file() is the Python 2 builtin (alias of open())
            self.stats_skipped_file = file(stats_skipped_file, 'w')
        else:
            self.stats_skipped_file = None
        with open(filename) as fd:
            content = fd.read()
            # the spip_courriers parts of the spip export are not properly
            # encoded, we manually remove them here so the XML file can be
            # parsed correctly.
            content = content[:content.find('<spip_courriers>')] + \
                      content[content.rfind('</spip_courriers>')+17:]
            self.root = ET.fromstring(content)

            # load referentials first: keyword groups, keywords, rubrics
            self.load_keyword_groups()
            self.load_keywords()
            self.load_rubrics()

            # rubric '2' is 'Les emissions'; its direct children become
            # emissions, and their subrubrics also contribute articles
            emission_rubric_ids = []
            straight_emission_rubric_ids = []
            for rubric in self.rubrics['2'].rubrics.values(): # 'Les emissions'
                emission_rubric_ids.append(rubric.id_rubrique)
                straight_emission_rubric_ids.append(rubric.id_rubrique)
                for subrubric in rubric.rubrics.values():
                    emission_rubric_ids.append(subrubric.id_rubrique)

            self.load_breves()
            self.load_articles(emission_rubric_ids)

            self.set_urls()

            self.load_documents()
            self.load_document_links()

            if self.do_updates:
                self.process_emission_keywords()
                self.process_episode_keywords()

            # create/update the Django objects, optionally recording
            # old-URL -> new-URL pairs for the rewrite map
            for emission_id in straight_emission_rubric_ids:
                rubric = self.rubrics[emission_id]
                emission = self.get_or_create_emission(rubric)
                if rewritemap_file:
                    self.add_rewritemap_entries(rubric, emission)

                for article in rubric.articles.values():
                    episode = self.get_or_create_episode(article, emission)
                    if episode is None:
                        # not a real episode (no diffusion date), skipped
                        continue
                    if rewritemap_file:
                        self.add_rewritemap_entries(article, episode)
                    self.set_sound_files(article, episode)

            for breve in self.breves.values():
                newsitem = self.get_or_create_newsitem(breve)
                if rewritemap_file:
                    self.add_rewritemap_entries(breve, newsitem)

        if rewritemap_file:
            self.write_rewritemap(rewritemap_file)

        if self.stats_skipped_file:
            self.stats_skipped_file.close()

162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
    def load_keyword_groups(self):
        """Load the SPIP keyword groups the import cares about."""
        # only these groups are meaningful for the import
        interesting_groups = ('11',  # archives
                              '12',  # subjects
                              '3',   # category
                              '10')  # transversal
        self.keyword_groups = {}
        for xml_node in self.root.iter('spip_groupes_mots'):
            group = KeywordGroup()
            for attr in ('id_groupe', 'titre'):
                setattr(group, attr, xml_node.find(attr).text)
            if group.id_groupe in interesting_groups:
                self.keyword_groups[group.id_groupe] = group

    def load_keywords(self):
        """Load keywords belonging to the groups kept by load_keyword_groups.

        Bug fixed: the keyword used to be stored directly into
        self.keyword_groups, clobbering the KeywordGroup object for its
        group; it now goes into that group's own `keywords` mapping (the
        dict KeywordGroup.__init__ creates for this purpose).
        """
        self.keywords = {}
        for keyword_xml in self.root.iter('spip_mots'):
            keyword = Keyword()
            for attr in ('id_mot', 'titre', 'id_groupe'):
                setattr(keyword, attr, keyword_xml.find(attr).text)
            if keyword.id_groupe not in self.keyword_groups:
                continue
            if keyword.id_mot in ('92',): # blacklist
                continue
            self.keywords[keyword.id_mot] = keyword
            self.keyword_groups[keyword.id_groupe].keywords[keyword.id_mot] = keyword

    def load_rubrics(self):
        """Load every SPIP rubric, then wire children to their parents."""
        attrs = ('id_rubrique', 'id_parent', 'titre', 'descriptif', 'texte')
        self.rubrics = {}
        for xml_node in self.root.iter('spip_rubriques'):
            rubric = Rubric()
            for attr in attrs:
                setattr(rubric, attr, xml_node.find(attr).text)
            self.rubrics[rubric.id_rubrique] = rubric

        # second pass: build the tree ('0' or empty parent = top level)
        for rubric in self.rubrics.values():
            parent_id = rubric.id_parent
            if parent_id and parent_id != '0':
                self.rubrics[parent_id].rubrics[rubric.id_rubrique] = rubric

    def load_breves(self):
        """Load the published SPIP news items ("breves")."""
        self.breves = {}
        for xml_node in self.root.iter('spip_breves'):
            breve = Breve()
            for attr in ('id_breve', 'titre', 'texte', 'date_heure', 'statut'):
                setattr(breve, attr, xml_node.find(attr).text)
            if breve.statut == 'publie':  # keep only published items
                self.breves[breve.id_breve] = breve

    def load_articles(self, emission_rubric_ids):
        """Load published articles from the emission rubrics, plus events.

        Articles from rubric '65' (events) are converted into pseudo-Breve
        objects so they can be merged with the news items later; any other
        article must be published and belong to one of
        *emission_rubric_ids*.
        """
        self.articles = {}
        for article_xml in self.root.iter('spip_articles'):
            if article_xml.find('id_rubrique').text == '65':
                pass # rubric for events, handle with care
            elif not article_xml.find('id_rubrique').text in emission_rubric_ids:
                continue
            article = Article()
            for attr in ('id_rubrique', 'id_article', 'titre', 'surtitre',
                         'soustitre', 'descriptif', 'chapo', 'texte',
                         'date_redac', 'statut', 'date'):
                setattr(article, attr, article_xml.find(attr).text)

            if article.id_rubrique == '65':
                # this is an event, they get a special handling, to be
                # merged with newsitems
                if article.statut not in ('publie', 'prop'):
                    continue
                breve = Breve()
                # the '0' prefix keeps these synthetic ids from colliding
                # with real breve ids (see set_urls / add_rewritemap_entries)
                breve.id_breve = '0%s' % article.id_article
                breve.titre = article.titre
                breve.texte = article.texte
                breve.date_heure = article.date
                self.breves[breve.id_breve] = breve
                continue

            if article.statut != 'publie':
                continue
            article.mots_cles = []
            self.articles[article.id_article] = article

            if self.rubrics[article.id_rubrique].id_parent != '2':
                # the spip structure didn't really expect subrubrics in the
                # 'emissions' section, but people added some nevertheless,
                # move related articles to their parent rubric.
                article.id_rubrique = self.rubrics[article.id_rubrique].id_parent

            self.rubrics[article.id_rubrique].articles[article.id_article] = article

    def set_urls(self):
        """Attach the public `url` slug from spip_urls rows to the matching
        article / breve / rubric / keyword object."""
        for url_xml in self.root.iter('spip_urls'):
            obj_id = url_xml.find('id_objet').text
            obj_type = url_xml.find('type').text
            url = url_xml.find('url').text
            if obj_type == 'article':
                if obj_id in self.articles:
                    self.articles[obj_id].url = url
                elif ('0%s' % obj_id) in self.breves:
                    # events converted to pseudo-breves (see load_articles)
                    self.breves['0' + obj_id].url = url
            elif obj_type == 'rubrique' and obj_id in self.rubrics:
                self.rubrics[obj_id].url = url
            elif obj_type == 'mot' and obj_id in self.keywords:
                self.keywords[obj_id].url = url
            elif obj_type == 'breve' and obj_id in self.breves:
                self.breves[obj_id].url = url

    def load_documents(self):
        """Load SPIP documents; fetch remote ('distant') ones into media/IMG.

        Remote documents are downloaded only when the media/IMG directory
        exists and the file is not already present locally; when media/IMG
        is missing the document is skipped entirely.
        """
        self.documents = {}
        for spip_doc_xml in self.root.iter('spip_documents'):
            id_document = spip_doc_xml.find('id_document').text
            doc = Document()
            doc.filename = spip_doc_xml.find('fichier').text
            doc.title = spip_doc_xml.find('titre').text
            if spip_doc_xml.find('distant').text == 'oui':
                # remote document: 'fichier' holds a URL; keep only its
                # basename as the local filename
                url = doc.filename
                doc.filename = os.path.split(url)[-1]
                filename = os.path.join('media/IMG/', doc.filename)
                if not os.path.exists('media/IMG'):
                    continue
                if not os.path.exists(filename):
                    # NOTE(review): text-mode 'w' and no error handling on
                    # the download; file() is the Python 2 builtin
                    fd = file(filename, 'w')
                    fd.write(urllib2.urlopen(url).read())
                    fd.close()
            self.documents[id_document] = doc

    def load_document_links(self):
        """Map article ids to the list of documents attached to them."""
        self.attached_documents = {}
        for link_xml in self.root.iter('spip_documents_liens'):
            if link_xml.find('objet').text != 'article':
                continue  # only article attachments are interesting
            doc_id = link_xml.find('id_document').text
            article_id = link_xml.find('id_objet').text
            # unknown document ids yield None entries, filtered by callers
            self.attached_documents.setdefault(article_id, []).append(
                    self.documents.get(doc_id))

    def process_emission_keywords(self):
        """Apply rubric/keyword relations: archived flag and categories."""
        for link_xml in self.root.iter('spip_mots_rubriques'):
            rubric = self.rubrics.get(link_xml.find('id_rubrique').text)
            if not rubric:
                continue

            keyword_id = link_xml.find('id_mot').text
            if keyword_id == '100': # archive
                rubric.archived = True
                continue

            keyword = self.keywords.get(keyword_id)
            if keyword is None:
                continue

            if keyword.id_groupe != '3':  # only the "category" group here
                continue
            rubric.categories.append(keyword)
            if not keyword.related_object:
                # get-or-create the matching Category object by title
                existing = Category.objects.filter(title=keyword.titre)
                category = existing[0] if len(existing) else Category()
                category.title = keyword.titre
                category.save()
                keyword.related_object = category

    def process_episode_keywords(self):
        """Collect "transversales" and "sujets" keywords as article tags."""
        for link_xml in self.root.iter('spip_mots_articles'):
            article = self.articles.get(link_xml.find('id_article').text)
            if not article:
                continue
            keyword = self.keywords.get(link_xml.find('id_mot').text)
            if keyword is None:
                continue
            if keyword.id_groupe in ('10', '12'): # transversales & sujets
                article.mots_cles.append(keyword.titre)

    def get_or_create_emission(self, rubric):
        """Return the Emission matching *rubric*, creating or updating it.

        The emission is looked up first under the full SPIP url slug, then
        under the part before the first comma; the short form is the slug
        that is ultimately stored.
        """
        slug = rubric.url.lower()
        possible_slugs = [rubric.url.lower(), rubric.url.lower().split(',')[0]]
        for slug in possible_slugs:
            try:
                emission = Emission.objects.get(slug=slug)
                break
            except Emission.DoesNotExist:
                continue
        else:
            # no existing emission under any candidate slug
            emission = Emission()

        emission.slug = possible_slugs[-1]

        if emission.id and not self.do_updates:
            # --no-updates: leave existing objects untouched
            return emission

        # keep the pre-comma form of whichever slug the loop ended on
        slug = slug.split(',')[0]
        emission.slug = slug
        emission.title = rubric.titre
        emission.archived = rubric.archived
        emission.description = makeHtmlFromSpip(rubric.descriptif,
                        documents=self.documents) or None
        emission.text = makeHtmlFromSpip(rubric.texte,
                        documents=self.documents) or None

        # image: prefer the SPIP rubric logo (rubonNN.ext), fall back to
        # the first image referenced in the text or the description
        image_path = None
        for ext in ('.jpg', '.png', '.gif'):
            if os.path.exists('media/IMG/rubon%s%s' % (rubric.id_rubrique, ext)):
                image_path = ['media/IMG/rubon%s%s' % (rubric.id_rubrique, ext)]
                break
        else:
            if emission.text:
                image_path = re.findall('src="/(media/IMG.*?)"', emission.text, re.DOTALL)
            elif emission.description:
                image_path = re.findall('src="/(media/IMG.*?)"', emission.description, re.DOTALL)

        self.set_image(emission, image_path)

        # save first so the m2m categories relation can be written
        emission.save()
        emission.categories.clear()
        for category in rubric.categories:
            emission.categories.add(category.related_object)
        emission.save()
        return emission

    def get_or_create_episode(self, article, emission):
        """Return the Episode for *article* under *emission*, or None.

        Articles without a diffusion date (date_redac) are not considered
        episodes: they are optionally logged to the stats file and skipped.
        Also creates a Diffusion at date_redac when none exists yet.
        """
        if article.date_redac == '0000-00-00 00:00:00':
            # date_redac was used for the diffusion date, if it's
            # not set it's probably not really an episode
            if self.stats_skipped_file:
                episode_files = self.attached_documents.get(article.id_article)
                if episode_files:
                    has_sound = u'♫'
                else:
                    has_sound = '-'
                base_spip_edit_url = 'http://www.radiopanik.org/spip/ecrire/?exec=articles_edit&id_article='
                # tab-separated report line, with a direct SPIP edit link
                print >> self.stats_skipped_file, unicode('%s\t%s\t%s\t%s%s' % (
                    emission.title, article.titre, has_sound,
                    base_spip_edit_url, article.id_article)).encode('utf-8')
            return None

        # generic 'nouvel-article' slugs are replaced by a slug built from
        # the title plus the article id (to keep it unique)
        possible_slugs = [article.url.lower()]
        if article.url.lower().startswith('nouvel-article'):
            possible_slugs.append(slugify(unicode(article.titre)) + '-%s' % article.id_article)

        for slug in possible_slugs:
            try:
                episode = Episode.objects.get(slug=slug)
                break
            except Episode.DoesNotExist:
                continue
        else:
            # no existing episode under any candidate slug
            episode = Episode()

        episode.slug = possible_slugs[-1]

        if episode.id and not self.do_updates:
            # --no-updates: leave existing objects untouched
            return episode

        episode.emission = emission
        episode.title = article.titre
        episode.description = makeHtmlFromSpip(article.descriptif,
                        documents=self.documents) or None
        episode.text = makeHtmlFromSpip(article.texte,
                        documents=self.documents) or None

        # image: prefer the SPIP article logo (artonNN.ext), fall back to
        # the first image referenced in the text or the description
        image_path = None
        for ext in ('.jpg', '.png', '.gif'):
            if os.path.exists('media/IMG/arton%s%s' % (article.id_article, ext)):
                image_path = ['media/IMG/arton%s%s' % (article.id_article, ext)]
                break
        else:
            if episode.text:
                image_path = re.findall('src="/(media/IMG.*?)"', episode.text, re.DOTALL)
            elif episode.description:
                image_path = re.findall('src="/(media/IMG.*?)"', episode.description, re.DOTALL)

        self.set_image(episode, image_path)

        episode.save()

        # keywords collected by process_episode_keywords become tags
        for motcle in article.mots_cles:
            episode.tags.add(motcle.lower())

        if not Diffusion.objects.filter(episode=episode).count():
            diffusion = Diffusion()
            diffusion.episode = episode
            try:
                diffusion.datetime = datetime.strptime(article.date_redac, '%Y-%m-%d %H:%M:%S')
            except ValueError:
                # unparseable date: silently skip creating the diffusion
                pass
            else:
                diffusion.save()

        return episode

    def set_sound_files(self, article, episode):
        """Create SoundFile objects for the article's .ogg/.mp3 attachments.

        Episodes that already have sound files are left untouched; missing
        local files and non-audio attachments are skipped.
        """
        if SoundFile.objects.filter(episode=episode).count():
            return # skip episodes that already have sound files
        episode_files = self.attached_documents.get(article.id_article)
        if episode_files:
            for episode_file in episode_files:
                if episode_file is None:
                    # the linked document id was never loaded
                    continue
                if os.path.splitext(episode_file.filename)[-1] not in ('.ogg', '.mp3'):
                    continue
                if not os.path.exists('media/IMG/' + episode_file.filename):
                    continue
                soundfile = SoundFile()
                soundfile.episode = episode
                soundfile.podcastable = True
                soundfile.fragment = False
                soundfile.title = episode_file.title or '[pas de titre]'
                # remove any stale file at the target path first —
                # presumably so the new upload keeps this exact name
                # rather than getting a suffixed one; confirm
                sound_path = os.path.join(settings.MEDIA_ROOT,
                        get_sound_path(soundfile, episode_file.filename))
                if os.path.exists(sound_path):
                    os.unlink(sound_path)
                soundfile.file = File(file('media/IMG/' + episode_file.filename))
                soundfile.save()

    def get_or_create_newsitem(self, breve):
        """Return the NewsItem matching *breve*, creating or updating it.

        Bug fixed: the method previously fell off the end without returning
        the newsitem, so handle() received None and add_rewritemap_entries
        silently produced no entries for breves.
        """
        slug = breve.url.lower()
        try:
            newsitem = NewsItem.objects.get(slug=slug)
        except NewsItem.DoesNotExist:
            newsitem = NewsItem()

        if newsitem.id and not self.do_updates:
            # --no-updates: leave existing objects untouched
            return newsitem

        newsitem.title = breve.titre
        newsitem.slug = slug
        newsitem.text = makeHtmlFromSpip(breve.texte,
                documents=self.documents) or None
        newsitem.datetime = datetime.strptime(breve.date_heure, '%Y-%m-%d %H:%M:%S')
        # image: prefer the SPIP breve logo (breveonNN.ext), fall back to
        # the first image referenced in the text
        for ext in ('.jpg', '.png', '.gif'):
            if os.path.exists('media/IMG/breveon%s%s' % (breve.id_breve, ext)):
                image_path = ['media/IMG/breveon%s%s' % (breve.id_breve, ext)]
                break
        else:
            # NOTE(review): newsitem.text may be None here, which would make
            # re.findall raise — confirm breves always have text
            image_path = re.findall('src="/(media/IMG.*?)"', newsitem.text, re.DOTALL)
        self.set_image(newsitem, image_path)
        newsitem.save()
        return newsitem
504
505
506
507
508
509
510
511

    def set_image(self, object, image_path):
        """Attach the first image of *image_path* (a list of candidate
        paths) to *object*, unless it already carries that image."""
        if not image_path:
            return
        candidate = image_path[0]
        if object.image:
            # same basename: image already attached, nothing to do
            if os.path.basename(object.image.path) == os.path.basename(candidate):
                return
        object.image = File(file(candidate))
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544

    def add_rewritemap_entries(self, spip_object, django_object):
        """Queue redirections from every legacy SPIP URL form of
        *spip_object* to the canonical URL of *django_object*.

        Non-content objects (anything that is not an Emission, Episode or
        NewsItem) are ignored.
        """
        if isinstance(django_object, Emission):
            target = reverse('emission-view', kwargs={'slug': django_object.slug})
        elif isinstance(django_object, Episode):
            target = reverse('episode-view', kwargs={
                'slug': django_object.slug, 'emission_slug': django_object.emission.slug})
        elif isinstance(django_object, NewsItem):
            target = reverse('news-view', kwargs={'slug': django_object.slug})
        else:
            return

        if spip_object.id[0] == '0':
            # our hack mapping some articles to newsitems
            spip_object.spip_url_object_name = 'article'
            spip_object.spip_url_marker = ''

        name = spip_object.spip_url_object_name
        oid = spip_object.id
        legacy_urls = [
            '%s%s' % (name, oid),
            'spip.php?%s%s' % (name, oid),
            '%s%s.html' % (name, oid),
        ]
        if name == 'article':
            legacy_urls.append(oid)
        legacy_urls.append('%s%s%s' % (spip_object.spip_url_marker,
                                       spip_object.url,
                                       spip_object.spip_url_marker))

        for src in legacy_urls:
            self.rewritemap.append((src, target))

    def write_rewritemap(self, rewritemap_file):
        """Write the queued (source, destination) pairs to *rewritemap_file*
        as an Apache RewriteMap text file: one space-separated pair per line.

        Fixed: the file is now opened with a context manager so the handle
        is closed even if a write raises (the previous file()/close() pair
        leaked it on error).
        """
        with open(rewritemap_file, 'w') as fd:
            for src, dst in self.rewritemap:
                # same output as the former `print >> fd, src, dst`
                fd.write('%s %s\n' % (src, dst))