load-from-spip.py 20.7 KB
Newer Older
1 2
# -*- coding: utf-8 -*-

3
from datetime import datetime
4
import time
5 6
import gzip
import xml.etree.ElementTree as ET
7
import os
fred's avatar
fred committed
8
import re
9
import urllib2
10

fred's avatar
fred committed
11 12
from optparse import make_option

13
from django.conf import settings
14
from django.core.files import File
15
from django.core.management.base import BaseCommand, CommandError
16
from django.core.urlresolvers import reverse
fred's avatar
fred committed
17
from django.utils.text import slugify
18

fred's avatar
fred committed
19
from ...models import Emission, Episode, Diffusion, Category, SoundFile, NewsItem
20
from ...models import get_sound_path
21 22


23 24 25
from _spip2html import makeHtmlFromSpip


26
class Rubric(object):
    """In-memory stand-in for a SPIP 'rubrique' (section) record."""

    # set to True when the 'archive' keyword (id 100) is attached
    archived = False

    # pieces used to rebuild the public SPIP urls for the rewrite map
    spip_url_object_name = 'rubrique'
    spip_url_marker = '-'

    def __init__(self):
        self.rubrics = {}
        self.articles = {}
        self.categories = []

    @property
    def id(self):
        # generic accessor so all spip objects expose a common .id
        return self.id_rubrique

41 42

class Article(object):
    """In-memory stand-in for a SPIP 'article' record."""

    # spip article urls carry no marker around the slug
    spip_url_marker = ''
    spip_url_object_name = 'article'

    @property
    def id(self):
        # generic accessor so all spip objects expose a common .id
        return self.id_article
49 50


fred's avatar
fred committed
51
class Breve(object):
    """In-memory stand-in for a SPIP 'breve' (short news) record."""

    # spip breve urls are wrapped in '+' markers
    spip_url_marker = '+'
    spip_url_object_name = 'breve'

    @property
    def id(self):
        # generic accessor so all spip objects expose a common .id
        return self.id_breve
fred's avatar
fred committed
58 59


60 61 62 63 64 65 66 67 68
class KeywordGroup(object):
    """A SPIP keyword group; holds the keywords attached to it."""

    def __init__(self):
        # keyword id -> Keyword
        self.keywords = {}


class Keyword(object):
    # Django object created for this keyword (e.g. a Category), if any.
    related_object = None


69 70 71 72
class Document(object):
    # Plain attribute container for a spip document; the loader sets
    # .filename and .title on instances.
    pass


73 74 75 76
class Command(BaseCommand):
    """Management command importing a SPIP dump into the django models."""

    args = 'filename'
    help = 'Load emissions and episodes from a Spip dump file'

    option_list = BaseCommand.option_list + (
            make_option('--no-updates',
                action='store_true',
                dest='dont_update',
                default=False,
                help='Only create new objects, do not update existing ones'),
            make_option('--stats-skipped',
                dest='stats_skipped_file',
                metavar='FILE',
                default=None,
                # typo fix: 'ith' -> 'with'
                help='Create a CSV file with skipped articles'),
            make_option('--rewritemap',
                dest='rewritemap_file',
                metavar='FILE',
                default=None,
                # typo fix: 'a Apache' -> 'an Apache'
                help='Create an Apache RewriteMap'),
    )

95
    def handle(self, filename, dont_update, rewritemap_file, stats_skipped_file, **options):
        """Import emissions, episodes and news items from a SPIP XML dump.

        filename: path of the SPIP dump file.
        dont_update: when set, existing objects are not modified.
        rewritemap_file: optional path for an Apache RewriteMap output.
        stats_skipped_file: optional path logging skipped articles.
        """
        self.do_updates = (not dont_update)
        if rewritemap_file:
            self.rewritemap = []
        if stats_skipped_file:
            self.stats_skipped_file = file(stats_skipped_file, 'w')
        else:
            self.stats_skipped_file = None
        with open(filename) as fd:
            content = fd.read()
            # the spip_courriers parts of the spip export are not properly
            # encoded, we manually remove them here so the XML file can be
            # parsed correctly.
            content = content[:content.find('<spip_courriers>')] + \
                      content[content.rfind('</spip_courriers>')+17:]
            self.root = ET.fromstring(content)

            # load supporting data first: keyword groups, keywords, rubrics
            self.load_keyword_groups()
            self.load_keywords()
            self.load_rubrics()

            # rubric '2' is 'Les emissions'; its direct children become
            # emissions, and their own sub-rubrics are folded into them.
            emission_rubric_ids = []
            straight_emission_rubric_ids = []
            for rubric in self.rubrics['2'].rubrics.values(): # 'Les emissions'
                emission_rubric_ids.append(rubric.id_rubrique)
                straight_emission_rubric_ids.append(rubric.id_rubrique)
                for subrubric in rubric.rubrics.values():
                    emission_rubric_ids.append(subrubric.id_rubrique)

            self.load_breves()
            self.load_articles(emission_rubric_ids)

            self.set_urls()

            self.load_documents()
            self.load_document_links()

            if self.do_updates:
                self.process_emission_keywords()
                self.process_episode_keywords()

            # create/update the django objects from the loaded spip data
            for emission_id in straight_emission_rubric_ids:
                rubric = self.rubrics[emission_id]
                emission = self.get_or_create_emission(rubric)
                if rewritemap_file:
                    self.add_rewritemap_entries(rubric, emission)

                for article in rubric.articles.values():
                    episode = self.get_or_create_episode(article, emission)
                    if episode is None:
                        # not a real episode (no diffusion date), skipped
                        continue
                    if rewritemap_file:
                        self.add_rewritemap_entries(article, episode)

                    self.set_sound_files(article, episode)

            for breve in self.breves.values():
                newsitem = self.get_or_create_newsitem(breve)
                if rewritemap_file:
                    self.add_rewritemap_entries(breve, newsitem)

        if rewritemap_file:
            self.write_rewritemap(rewritemap_file)

        if self.stats_skipped_file:
            self.stats_skipped_file.close()

162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229
    def load_keyword_groups(self):
        """Index the SPIP keyword groups we care about, by group id."""
        # only these groups are meaningful for the import
        wanted_groups = ('11', # archives
                         '12', # subjects
                         '3',  # category
                         '10', # transversal
                         )
        self.keyword_groups = {}
        for xml_node in self.root.iter('spip_groupes_mots'):
            group = KeywordGroup()
            group.id_groupe = xml_node.find('id_groupe').text
            group.titre = xml_node.find('titre').text
            if group.id_groupe in wanted_groups:
                self.keyword_groups[group.id_groupe] = group

    def load_keywords(self):
        """Index keywords by id, keeping only those from known groups."""
        self.keywords = {}
        for keyword_xml in self.root.iter('spip_mots'):
            keyword = Keyword()
            for attr in ('id_mot', 'titre', 'id_groupe'):
                setattr(keyword, attr, keyword_xml.find(attr).text)
            if not keyword.id_groupe in self.keyword_groups:
                continue
            if keyword.id_mot in ('92',): # blacklist
                continue
            self.keywords[keyword.id_mot] = keyword
            # bug fix: the original line replaced the KeywordGroup object in
            # self.keyword_groups with the Keyword itself; attach the keyword
            # to the group's keywords dict instead (which was never filled).
            self.keyword_groups[keyword.id_groupe].keywords[keyword.id_mot] = keyword

    def load_rubrics(self):
        """Index rubrics by id and attach each one to its parent."""
        self.rubrics = {}
        for xml_node in self.root.iter('spip_rubriques'):
            rubric = Rubric()
            rubric.id_rubrique = xml_node.find('id_rubrique').text
            rubric.id_parent = xml_node.find('id_parent').text
            rubric.titre = xml_node.find('titre').text
            rubric.descriptif = xml_node.find('descriptif').text
            rubric.texte = xml_node.find('texte').text
            self.rubrics[rubric.id_rubrique] = rubric

        # second pass: build the tree ('0' or empty marks a root rubric)
        for rubric in self.rubrics.values():
            parent_id = rubric.id_parent
            if parent_id and parent_id != '0':
                self.rubrics[parent_id].rubrics[rubric.id_rubrique] = rubric

    def load_breves(self):
        """Index published breves (short news items) by id."""
        self.breves = {}
        attrs = ('id_breve', 'titre', 'texte', 'date_heure', 'statut')
        for xml_node in self.root.iter('spip_breves'):
            breve = Breve()
            for attr in attrs:
                setattr(breve, attr, xml_node.find(attr).text)
            # only keep published ones
            if breve.statut == 'publie':
                self.breves[breve.id_breve] = breve

    def load_articles(self, emission_rubric_ids):
        """Load published articles from the emission rubrics.

        Articles from rubric '65' (events) are not kept as articles: they
        are converted into Breve objects with a '0'-prefixed id, to be
        merged with the news items.
        """
        self.articles = {}
        for article_xml in self.root.iter('spip_articles'):
            if article_xml.find('id_rubrique').text == '65':
                pass # rubric for events, handle with care
            elif not article_xml.find('id_rubrique').text in emission_rubric_ids:
                continue
            article = Article()
            for attr in ('id_rubrique', 'id_article', 'titre', 'surtitre',
                         'soustitre', 'descriptif', 'chapo', 'texte',
                         'date_redac', 'statut', 'date'):
                setattr(article, attr, article_xml.find(attr).text)

            if article.id_rubrique == '65':
                # this is an event, they get a special handling, to be
                # merged with newsitems
                if article.statut not in ('publie', 'prop'):
                    continue
                # the '0' prefix keeps these synthetic breve ids from
                # colliding with real breve ids
                breve = Breve()
                breve.id_breve = '0%s' % article.id_article
                breve.titre = article.titre
                breve.texte = article.texte
                breve.date_heure = article.date
                self.breves[breve.id_breve] = breve
                continue

            if article.statut != 'publie':
                continue
            article.mots_cles = []
            self.articles[article.id_article] = article

            if self.rubrics[article.id_rubrique].id_parent != '2':
                # the spip structure didn't really expect subrubrics in the
                # 'emissions' section, but people added some nevertheless,
                # move related articles to their parent rubric.
                article.id_rubrique = self.rubrics[article.id_rubrique].id_parent

            self.rubrics[article.id_rubrique].articles[article.id_article] = article

    def set_urls(self):
        """Attach the public SPIP url to every known loaded object."""
        for url_xml in self.root.iter('spip_urls'):
            object_id = url_xml.find('id_objet').text
            object_type = url_xml.find('type').text
            url = url_xml.find('url').text
            if object_type == 'article' and object_id in self.articles:
                self.articles[object_id].url = url
            elif object_type == 'article' and ('0%s' % object_id) in self.breves:
                # event articles were remapped to breves with a '0' prefix
                self.breves['0' + object_id].url = url
            elif object_type == 'rubrique' and object_id in self.rubrics:
                self.rubrics[object_id].url = url
            elif object_type == 'mot' and object_id in self.keywords:
                self.keywords[object_id].url = url
            elif object_type == 'breve' and object_id in self.breves:
                self.breves[object_id].url = url

    def load_documents(self):
        """Index attached documents by id, fetching remote ones locally.

        Remote documents ('distant' == 'oui') are downloaded into
        media/IMG/ when that directory exists; documents that cannot be
        stored are skipped.
        """
        self.documents = {}
        for spip_doc_xml in self.root.iter('spip_documents'):
            id_document = spip_doc_xml.find('id_document').text
            doc = Document()
            doc.filename = spip_doc_xml.find('fichier').text
            doc.title = spip_doc_xml.find('titre').text
            if spip_doc_xml.find('distant').text == 'oui':
                # remote document: the 'fichier' field holds the url
                url = doc.filename
                doc.filename = os.path.split(url)[-1]
                filename = os.path.join('media/IMG/', doc.filename)
                if not os.path.exists('media/IMG'):
                    continue
                if not os.path.exists(filename):
                    # bug fix: write in binary mode ('wb') and close the
                    # file deterministically; the original used file(..., 'w')
                    # without a close on error, which can corrupt binary
                    # payloads on platforms with text-mode translation.
                    with open(filename, 'wb') as fd:
                        fd.write(urllib2.urlopen(url).read())
            self.documents[id_document] = doc

    def load_document_links(self):
        """Map article ids to the list of their attached documents."""
        self.attached_documents = {}
        for link_xml in self.root.iter('spip_documents_liens'):
            # only article attachments are of interest here
            if link_xml.find('objet').text != 'article':
                continue
            document_id = link_xml.find('id_document').text
            article_id = link_xml.find('id_objet').text
            documents = self.attached_documents.setdefault(article_id, [])
            documents.append(self.documents.get(document_id))

    def process_emission_keywords(self):
        """Apply rubric keywords: the archive flag and the categories.

        Category keywords (group '3') get a matching Category django
        object created on first sight, stored as keyword.related_object.
        """
        for rubrickeyword_xml in self.root.iter('spip_mots_rubriques'):
            keyword_id = rubrickeyword_xml.find('id_mot').text
            rubric_id = rubrickeyword_xml.find('id_rubrique').text
            rubric = self.rubrics.get(rubric_id)
            if not rubric:
                continue

            if keyword_id == '100': # archive
                rubric.archived = True
                continue

            keyword = self.keywords.get(keyword_id)
            if keyword is None:
                continue

            if keyword.id_groupe == '3': # category
                rubric.categories.append(keyword)
                if not keyword.related_object:
                    # get-or-create the Category matching this keyword title
                    cs = Category.objects.filter(title=keyword.titre)
                    if len(cs):
                        c = cs[0]
                    else:
                        c = Category()
                    c.title = keyword.titre
                    c.save()
                    keyword.related_object = c

    def process_episode_keywords(self):
        """Collect 'transversales' and 'sujets' keywords as article tags."""
        for link_xml in self.root.iter('spip_mots_articles'):
            article = self.articles.get(link_xml.find('id_article').text)
            if not article:
                continue

            keyword = self.keywords.get(link_xml.find('id_mot').text)
            if keyword is None:
                continue

            if keyword.id_groupe in ('10', '12'): # transversales & sujets
                article.mots_cles.append(keyword.titre)

    def get_or_create_emission(self, rubric):
        """Create or update the Emission matching an emission rubric."""
        # NOTE(review): this assignment is dead, the for loop below rebinds
        # slug immediately.
        slug = rubric.url.lower()
        # look up an existing emission under the full spip url or the part
        # before the first comma (which is the slug actually stored)
        possible_slugs = [rubric.url.lower(), rubric.url.lower().split(',')[0]]
        for slug in possible_slugs:
            try:
                emission = Emission.objects.get(slug=slug)
                break
            except Emission.DoesNotExist:
                continue
        else:
            emission = Emission()

        emission.slug = possible_slugs[-1]

        if emission.id and not self.do_updates:
            return emission

        # NOTE(review): when the loop above matched, slug is the matched
        # value; splitting it again is mostly redundant with the
        # possible_slugs[-1] assignment — kept as-is to preserve behaviour.
        slug = slug.split(',')[0]
        emission.slug = slug
        emission.title = rubric.titre
        emission.archived = rubric.archived
        emission.description = makeHtmlFromSpip(rubric.descriptif,
                        documents=self.documents) or None
        emission.text = makeHtmlFromSpip(rubric.texte,
                        documents=self.documents) or None

        # prefer the rubric logo (rubonNN.*) on disk; otherwise fall back
        # to the first image referenced by the text or the description
        image_path = None
        for ext in ('.jpg', '.png', '.gif'):
            if os.path.exists('media/IMG/rubon%s%s' % (rubric.id_rubrique, ext)):
                image_path = ['media/IMG/rubon%s%s' % (rubric.id_rubrique, ext)]
                break
        else:
            if emission.text:
                image_path = re.findall('src="/(media/IMG.*?)"', emission.text, re.DOTALL)
            elif emission.description:
                image_path = re.findall('src="/(media/IMG.*?)"', emission.description, re.DOTALL)

        self.set_image(emission, image_path)

        # save before touching categories (m2m needs a primary key)
        emission.save()
        emission.categories.clear()
        for category in rubric.categories:
            emission.categories.add(category.related_object)
        emission.save()
        return emission

    def get_or_create_episode(self, article, emission):
        """Create or update the Episode matching an article.

        Returns None for articles without a diffusion date (date_redac),
        optionally logging them to the skipped-stats file.
        """
        if article.date_redac == '0000-00-00 00:00:00':
            # date_redac was used for the diffusion date, if it's
            # not set it's probably not really an episode
            if self.stats_skipped_file:
                episode_files = self.attached_documents.get(article.id_article)
                if episode_files:
                    has_sound = u'♫'
                else:
                    has_sound = '-'
                base_spip_edit_url = 'http://www.radiopanik.org/spip/ecrire/?exec=articles_edit&id_article='
                print >> self.stats_skipped_file, unicode('%s\t%s\t%s\t%s%s' % (
                    emission.title, article.titre, has_sound,
                    base_spip_edit_url, article.id_article)).encode('utf-8')
            return None

        # spip's default 'nouvel-article' urls get a nicer slug built from
        # the article title and id instead
        possible_slugs = [article.url.lower()]
        if article.url.lower().startswith('nouvel-article'):
            possible_slugs.append(slugify(unicode(article.titre))[:40] + '-%s' % article.id_article)

        for slug in possible_slugs:
            try:
                episode = Episode.objects.get(slug=slug)
                break
            except Episode.DoesNotExist:
                continue
        else:
            episode = Episode()

        episode.slug = possible_slugs[-1]

        if episode.id and not self.do_updates:
            return episode

        episode.emission = emission
        episode.title = article.titre
        episode.description = makeHtmlFromSpip(article.descriptif,
                        documents=self.documents) or None
        episode.text = makeHtmlFromSpip(article.texte,
                        documents=self.documents) or None

        # prefer the article logo (artonNN.*) on disk; otherwise fall back
        # to the first image referenced by the text or the description
        image_path = None
        for ext in ('.jpg', '.png', '.gif'):
            if os.path.exists('media/IMG/arton%s%s' % (article.id_article, ext)):
                image_path = ['media/IMG/arton%s%s' % (article.id_article, ext)]
                break
        else:
            if episode.text:
                image_path = re.findall('src="/(media/IMG.*?)"', episode.text, re.DOTALL)
            elif episode.description:
                image_path = re.findall('src="/(media/IMG.*?)"', episode.description, re.DOTALL)

        self.set_image(episode, image_path)

        # save before adding tags (m2m needs a primary key)
        episode.save()

        for motcle in article.mots_cles:
            episode.tags.add(motcle.lower())

        # create a Diffusion from date_redac unless one already exists;
        # an unparsable date simply means no diffusion is recorded
        if not Diffusion.objects.filter(episode=episode).count():
            diffusion = Diffusion()
            diffusion.episode = episode
            try:
                diffusion.datetime = datetime.strptime(article.date_redac, '%Y-%m-%d %H:%M:%S')
            except ValueError:
                pass
            else:
                diffusion.save()

        return episode

    def set_sound_files(self, article, episode):
        """Create SoundFile objects for the sounds attached to an article."""
        if SoundFile.objects.filter(episode=episode).count():
            return # skip episodes that already have sound files
        episode_files = self.attached_documents.get(article.id_article) or []
        for episode_file in episode_files:
            # attachments may reference unknown documents (None)
            if episode_file is None:
                continue
            extension = os.path.splitext(episode_file.filename)[-1]
            if extension not in ('.ogg', '.mp3'):
                continue
            local_path = 'media/IMG/' + episode_file.filename
            if not os.path.exists(local_path):
                continue
            soundfile = SoundFile()
            soundfile.episode = episode
            soundfile.podcastable = True
            soundfile.fragment = False
            soundfile.title = episode_file.title or '[pas de titre]'
            # remove any stale file already sitting at the target path
            sound_path = os.path.join(settings.MEDIA_ROOT,
                    get_sound_path(soundfile, episode_file.filename))
            if os.path.exists(sound_path):
                os.unlink(sound_path)
            soundfile.file = File(file(local_path))
            soundfile.save()

    def get_or_create_newsitem(self, breve):
        """Create or update the NewsItem matching a breve.

        Returns the NewsItem in all cases.
        """
        slug = breve.url.lower()
        try:
            newsitem = NewsItem.objects.get(slug=slug)
        except NewsItem.DoesNotExist:
            newsitem = NewsItem()

        if newsitem.id and not self.do_updates:
            return newsitem

        newsitem.title = breve.titre
        newsitem.slug = slug
        newsitem.text = makeHtmlFromSpip(breve.texte,
                documents=self.documents) or None
        newsitem.datetime = datetime.strptime(breve.date_heure, '%Y-%m-%d %H:%M:%S')
        # prefer the breve logo (breveonNN.*) on disk; otherwise fall back
        # to the first image referenced by the text
        image_path = None
        for ext in ('.jpg', '.png', '.gif'):
            if os.path.exists('media/IMG/breveon%s%s' % (breve.id_breve, ext)):
                image_path = ['media/IMG/breveon%s%s' % (breve.id_breve, ext)]
                break
        else:
            # bug fix: newsitem.text may be None (empty spip text), which
            # used to crash re.findall with a TypeError
            if newsitem.text:
                image_path = re.findall('src="/(media/IMG.*?)"', newsitem.text, re.DOTALL)
        self.set_image(newsitem, image_path)
        newsitem.save()
        # bug fix: the newsitem was never returned here, so callers got
        # None and rewrite-map entries for breves were silently skipped
        return newsitem
504 505 506 507 508 509 510 511

    def set_image(self, object, image_path):
        """Attach the first image of image_path to object, when needed."""
        if not image_path:
            return
        first_path = image_path[0]
        if object.image:
            # same file already attached: nothing to do
            if os.path.basename(object.image.path) == os.path.basename(first_path):
                return
        object.image = File(file(first_path))
512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544

    def add_rewritemap_entries(self, spip_object, django_object):
        """Register redirects from the old spip urls to the new object url.

        Silently ignores django_object values that are not an Emission,
        Episode or NewsItem (e.g. None).
        """
        if isinstance(django_object, Emission):
            object_url = reverse('emission-view', kwargs={'slug': django_object.slug})
        elif isinstance(django_object, Episode):
            object_url = reverse('episode-view', kwargs={
                'slug': django_object.slug, 'emission_slug': django_object.emission.slug})
        elif isinstance(django_object, NewsItem):
            object_url = reverse('news-view', kwargs={'slug': django_object.slug})
        else:
            return

        if spip_object.id[0] == '0':
            # our hack mapping some articles to newsitems
            spip_object.spip_url_object_name = 'article'
            spip_object.spip_url_marker = ''

        # enumerate every url form spip may have served for this object
        urls = []
        urls.append('%s%s' % (spip_object.spip_url_object_name, spip_object.id))
        urls.append('spip.php?%s%s' % (spip_object.spip_url_object_name, spip_object.id))
        urls.append('%s%s.html' % (spip_object.spip_url_object_name, spip_object.id))
        if spip_object.spip_url_object_name == 'article':
            urls.append(spip_object.id)
        urls.append('%s%s%s' % (spip_object.spip_url_marker, spip_object.url, spip_object.spip_url_marker))

        for url in urls:
            self.rewritemap.append((url, object_url))

    def write_rewritemap(self, rewritemap_file):
        """Write the collected (old url, new url) pairs, one per line."""
        fd = file(rewritemap_file, 'w')
        for src, dst in self.rewritemap:
            # same output as `print >> fd, src, dst`
            fd.write('%s %s\n' % (src, dst))
        fd.close()