load-from-spip.py 5.16 KB
Newer Older
1
2
3
4
5
6
7
8
9
from datetime import datetime
import gzip
import xml.etree.ElementTree as ET

from django.core.management.base import BaseCommand, CommandError

from panikdb.emissions.models import Emission, Episode, Diffusion


10
11
12
from _spip2html import makeHtmlFromSpip


13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
class Rubric(object):
    """In-memory node for one Spip rubric (section).

    A fresh instance only carries two empty containers:

    * ``articles`` -- dict filled by the importer with the articles that
      belong to this rubric;
    * ``rubrics`` -- dict filled by the importer with the child rubrics.

    Every other attribute is attached dynamically by the code that parses
    the dump.
    """

    def __init__(self):
        # each instance gets its own, independent dictionaries
        self.articles, self.rubrics = dict(), dict()

class Article(object):
    """Plain attribute bag for one Spip article.

    The class defines nothing itself; the code that parses the dump
    attaches the attributes it needs with ``setattr``.
    """


class Command(BaseCommand):
    """Import emissions and their episodes from a Spip XML dump.

    The direct children of the 'Les emissions' rubric become ``Emission``
    objects, their published articles become ``Episode`` objects, and a
    first ``Diffusion`` is created from the article's ``date_redac`` when
    the episode has none yet. Existing emissions and episodes (matched on
    their slug) are updated in place.
    """
    args = 'filename'
    help = 'Load emissions and episodes from a Spip dump file'

    # id of the 'Les emissions' rubric in the Spip export
    EMISSIONS_RUBRIC_ID = '2'
    # value found in date_redac when no date was set on the article
    UNSET_DATE = '0000-00-00 00:00:00'

    def handle(self, filename, *args, **options):
        """Parse the dump stored in `filename` and load it in the database.

        Raises CommandError when the dump has no 'Les emissions' rubric.
        """
        with open(filename) as fd:
            content = fd.read()
        root = ET.fromstring(self._strip_courriers(content))

        rubrics = self._load_rubrics(root)
        try:
            emissions_root = rubrics[self.EMISSIONS_RUBRIC_ID]
        except KeyError:
            raise CommandError('no rubric with id %s in %s' % (
                self.EMISSIONS_RUBRIC_ID, filename))

        # "straight" ids are the emissions themselves; the full set also
        # holds their direct subrubrics, whose articles get moved up.
        straight_emission_rubric_ids = []
        emission_rubric_ids = set()
        for rubric in emissions_root.rubrics.values():
            straight_emission_rubric_ids.append(rubric.id_rubrique)
            emission_rubric_ids.add(rubric.id_rubrique)
            emission_rubric_ids.update(
                subrubric.id_rubrique for subrubric in rubric.rubrics.values())

        articles = self._load_articles(root, rubrics, emission_rubric_ids)
        self._attach_urls(root, rubrics, articles)

        for emission_id in straight_emission_rubric_ids:
            rubric = rubrics[emission_id]
            emission = self._save_emission(rubric)
            if emission is None:
                continue
            for article in rubric.articles.values():
                self._save_episode(emission, article)

    @staticmethod
    def _text(node, tag):
        """Return the text of child `tag` of `node`, None if absent or empty."""
        child = node.find(tag)
        if child is None:
            return None
        return child.text

    @staticmethod
    def _strip_courriers(content):
        """Return `content` without its spip_courriers section(s).

        The spip_courriers parts of the spip export are not properly
        encoded, they are removed so the XML file can be parsed correctly.
        A dump without such a section is returned untouched.
        """
        closing_tag = '</spip_courriers>'
        start = content.find('<spip_courriers>')
        end = content.rfind(closing_tag)
        if start == -1 or end < start:
            # find()/rfind() return -1 when the tag is missing; slicing
            # with those values would silently damage the document.
            return content
        return content[:start] + content[end + len(closing_tag):]

    def _load_rubrics(self, root):
        """Build the {id_rubrique: Rubric} mapping and link children to parents."""
        rubrics = {}
        for rubric_xml in root.iter('spip_rubriques'):
            rubric = Rubric()
            for attr in ('id_rubrique', 'id_parent', 'titre', 'texte'):
                setattr(rubric, attr, self._text(rubric_xml, attr))
            rubrics[rubric.id_rubrique] = rubric

        for rubric in rubrics.values():
            if not rubric.id_parent or rubric.id_parent == '0':
                continue
            parent = rubrics.get(rubric.id_parent)
            if parent is None:
                # orphan rubric, its parent is not part of the dump
                continue
            parent.rubrics[rubric.id_rubrique] = rubric
        return rubrics

    def _load_articles(self, root, rubrics, emission_rubric_ids):
        """Return {id_article: Article} for published articles of emissions.

        Each article is also registered in the ``articles`` dict of the
        emission rubric it ends up in.
        """
        articles = {}
        for article_xml in root.iter('spip_articles'):
            if self._text(article_xml, 'id_rubrique') not in emission_rubric_ids:
                continue
            article = Article()
            for attr in ('id_rubrique', 'id_article', 'titre', 'surtitre',
                         'soustitre', 'descriptif', 'chapo', 'texte',
                         'date_redac', 'statut'):
                setattr(article, attr, self._text(article_xml, attr))
            if article.statut != 'publie':
                continue
            articles[article.id_article] = article

            if rubrics[article.id_rubrique].id_parent != self.EMISSIONS_RUBRIC_ID:
                # the spip structure didn't really expect subrubrics in the
                # 'emissions' section, but people added some nevertheless,
                # move related articles to their parent rubric.
                article.id_rubrique = rubrics[article.id_rubrique].id_parent

            rubrics[article.id_rubrique].articles[article.id_article] = article
        return articles

    def _attach_urls(self, root, rubrics, articles):
        """Set the ``url`` attribute of known articles and rubrics from spip_urls."""
        for spip_url_xml in root.iter('spip_urls'):
            id_objet = self._text(spip_url_xml, 'id_objet')
            object_type = self._text(spip_url_xml, 'type')
            if object_type == 'article' and id_objet in articles:
                articles[id_objet].url = self._text(spip_url_xml, 'url')
            elif object_type == 'rubrique' and id_objet in rubrics:
                rubrics[id_objet].url = self._text(spip_url_xml, 'url')

    def _save_emission(self, rubric):
        """Create or update the Emission for `rubric`; None if it has no url."""
        url = getattr(rubric, 'url', None)
        if not url:
            # the slug is derived from the url, nothing can be done without
            self.stderr.write('skipping rubric %s: no url in dump\n' %
                              rubric.id_rubrique)
            return None
        slug = url.lower()
        try:
            emission = Emission.objects.get(slug=slug)
        except Emission.DoesNotExist:
            emission = Emission()
            emission.slug = slug
        emission.title = rubric.titre
        emission.description = makeHtmlFromSpip(rubric.texte)
        emission.save()
        return emission

    def _save_episode(self, emission, article):
        """Create or update the Episode for `article`, with a first Diffusion."""
        if not article.date_redac or article.date_redac == self.UNSET_DATE:
            # date_redac was used for the diffusion date, if it's
            # not set it's probably not really an episode
            return
        url = getattr(article, 'url', None)
        if not url:
            self.stderr.write('skipping article %s: no url in dump\n' %
                              article.id_article)
            return
        slug = url.lower()
        try:
            episode = Episode.objects.get(slug=slug)
        except Episode.DoesNotExist:
            episode = Episode()
            episode.slug = slug
        episode.emission = emission
        episode.title = article.titre
        episode.description = makeHtmlFromSpip(article.texte)
        episode.save()

        if Diffusion.objects.filter(episode=episode).exists():
            return
        try:
            diffusion_datetime = datetime.strptime(
                article.date_redac, '%Y-%m-%d %H:%M:%S')
        except ValueError:
            # unparseable date: keep the episode, just don't schedule it
            return
        diffusion = Diffusion()
        diffusion.episode = episode
        diffusion.datetime = diffusion_datetime
        diffusion.save()