# rss_generator.py

import os
from time import ctime, localtime, mktime, strftime, strptime

from siteconfig import siteconfig
from view_functions import is_hidden_path


class RSS_Item:
    """
    RSS_Item - a (very) basic implementation of an object in an RSS
    feed using only essential parameters as specified in:
    https://www.rssboard.org/rss-specification#hrelementsOfLtitemgt

    Item data is generated from a given file path.

    Attributes set by __init__:
        FULL_PATH   - the path the item was built from
        TITLE       - file name without its final extension
        FILE_TYPE   - final extension without the dot ('' if none)
        DESCRIPTION - leading paragraphs of the file (see parse_file)
        LAST_UPDATE - modification time as a time.ctime() string
        URI         - FULL_PATH minus its first two '/' components
        LINK        - rss_channel_config['LINK'] followed by URI
    """
    # Maximum number of paragraphs copied into DESCRIPTION.
    PARAGRAPHS = siteconfig.rss_channel_config['DESCRIPTION_LENGTH']

    class NotAFile(Exception):
        """
        Raised when an RSS_Item is made out of a directory or any
        other path that is not an existing regular file.
        """
        def __init__(self, path: str):
            self.path = path
            self.message = f"{path} not a file"
            super().__init__(self.message)

    def __init__(self, path: str):
        """
        Build the item from the file at `path`.

        Raises:
            RSS_Item.NotAFile: if `path` is not an existing file.
        """
        if not os.path.isfile(path):
            raise self.NotAFile(path)

        self.FULL_PATH = path
        # Split the extension off the file name only, so a dot in a
        # directory name (or the leading "./") cannot leak into TITLE
        # or FILE_TYPE, and a name without any dot does not crash.
        filename = os.path.basename(path)
        stem, dot, extension = filename.rpartition('.')
        self.TITLE = stem if dot else filename
        self.FILE_TYPE = extension if dot else ''
        self.DESCRIPTION = self.parse_file()
        self.LAST_UPDATE = self.file_last_modified()
        self.URI = self.get_uri()
        self.LINK = siteconfig.rss_channel_config['LINK'] + self.URI

    def __str__(self):
        return "<RSS_Item at {} - {}, {}>".format(
            self.FULL_PATH, self.TITLE, self.short_timestamp()
        )

    def short_timestamp(self):
        """
        short_timestamp - LAST_UPDATE reformatted as
        "YYYY-MM-DD HH:MM +ZZZZ" in local time.
        """
        # strptime() with no format parses the ctime() layout, but the
        # struct_time it returns carries no UTC offset for %z to print.
        # Converting through mktime()/localtime() yields one that does.
        parsed = strptime(self.LAST_UPDATE)
        return strftime("%Y-%m-%d %H:%M %z", localtime(mktime(parsed)))

    def parse_file(self):
        """
        parse_file - reads the file at FULL_PATH and returns the first
        PARAGRAPHS paragraphs, with their lines stripped and joined by
        single spaces.

        For 'html' / 'html!' files a paragraph runs from a line that
        starts with <p> up to and including the line that ends with
        </p>; everything outside such a span is ignored. Any other
        file is treated as plain text, where paragraphs are runs of
        non-blank lines separated by one or more blank lines.
        """
        is_html = self.FILE_TYPE in ('html', 'html!')
        pieces = []
        paragraphs = 0
        in_body = False       # HTML: currently inside <p> ... </p>
        in_paragraph = False  # text: currently inside a run of text

        # Decode explicitly as UTF-8 and replace undecodable bytes so
        # one badly encoded file cannot abort feed generation.
        with open(self.FULL_PATH, encoding='utf-8', errors='replace') as f:
            # Iterate lazily: we usually stop after a few paragraphs.
            for line in f:
                if paragraphs >= self.PARAGRAPHS:
                    break
                line = line.strip()
                if is_html:
                    if line.startswith("<p>"):
                        in_body = True
                    if in_body:
                        if line:
                            pieces.append(line)
                        # Only a </p> that closes an opened <p> counts.
                        if line.endswith("</p>"):
                            in_body = False
                            paragraphs += 1
                elif line:
                    pieces.append(line)
                    in_paragraph = True
                elif in_paragraph:
                    # First blank line after text ends the paragraph;
                    # leading or repeated blank lines are not counted.
                    in_paragraph = False
                    paragraphs += 1

        # Lines were stripped, so join with a space to keep the words
        # at line boundaries from running together.
        return ' '.join(pieces)

    def file_last_modified(self):
        """
        file_last_modified - the file's modification time (st_mtime)
        formatted by time.ctime().
        """
        # st_mtime is the content modification time; st_ctime is the
        # metadata-change time on Unix and so is not "last modified".
        return ctime(os.stat(self.FULL_PATH).st_mtime)

    def get_uri(self):
        """
        get_uri - FULL_PATH without its first two '/'-separated
        components, i.e. everything after "./templates/".
        """
        return '/'.join(self.FULL_PATH.split('/')[2:])


def get_rss_channel():
    """
    get_rss_channel - list all files from the BASE_DIR, and if allowed,
    add them as RSS_Items to populate feed.xml. Called by feed.xml view

    A file is allowed when its extension is listed in
    rss_channel_config['RSS_FILE_EXT'], its path is not in
    siteconfig.RSS_OMIT, and is_hidden_path() rejects it as hidden.

    Returns:
        A list of RSS_Item objects in os.walk() order.
    """
    items = []
    extensions = siteconfig.rss_channel_config['RSS_FILE_EXT']
    for root, _dirs, files in os.walk(siteconfig.BASE_DIR):
        for f in files:
            # remember, path will be like "./templates/site/..."
            path = os.path.join(root, f)
            if path.split(".")[-1] not in extensions:
                continue
            if path in siteconfig.RSS_OMIT:
                continue
            # Drop the leading "." before asking whether it is hidden.
            if is_hidden_path(path.split('.', 1)[1]):
                continue
            try:
                item = RSS_Item(path)
            except RSS_Item.NotAFile:
                # os.walk() also lists broken symlinks and files that
                # vanish mid-walk; skip them instead of failing the
                # whole feed.
                continue
            items.append(item)
    return items