CNK's Blog

Import files into Wagtail

I am building a site that is replacing an older site and I want to preserve a substantial number of PDF files. So I wrote a manage.py command to import all the files in a nested set of directories into corresponding nested collections in Wagtail. For example, given the following local directory:

  archive
      - some-file.pdf
      - 2020
        - file1.pdf
        - file2.pdf
      - 2021
        - file3.pdf

my script will create collections for 2020 and 2021 and then import the 4 PDF files into the correct collections and sub-collections.

  # core/management/commands/import_documents_from_directory.py

  from django.core.exceptions import ObjectDoesNotExist
  from django.core.management import BaseCommand, CommandError
  from wagtail.models import Collection, get_root_collection_id

  from core.jobs.document_importer import DocumentImporter

  class Command(BaseCommand):
      """
      Management command that imports a directory tree of files into Wagtail documents.

      The command itself only parses its options, resolves the collection that the import
      should be rooted at, and then delegates the actual work to DocumentImporter.
      """

      # Adjacent string literals are concatenated by Python; a plain quoted string
      # cannot span two physical lines.
      help = (
          "Imports all files nested under `pdf-directory` into "
          "corresponding collection under the given base collection."
      )

      def add_arguments(self, parser):
          """Register the --pdf-directory, --base-collection and --dry-run options."""
          parser.add_argument(
              '--pdf-directory',
              dest='pdf_directory',
              default='/tmp/documents',
              help="Path to the local directory where the PDFs are located"
          )

          parser.add_argument(
              '--base-collection',
              dest='base_collection',
              required=False,
              help="Which collection should get these files? Will use the root collection if this is missing."
          )

          parser.add_argument(
              '--dry-run',
              action='store_true',
              dest='dry_run',
              default=False,
              help='Try not to change the database; just show what would have been done.',
          )

      def handle(self, *args, **options):
          """
          Resolve the base collection and run the import.

          Raises CommandError when the source directory does not exist, or when the requested
          base collection is missing or its name is ambiguous.
          """
          # Imported locally so this block does not depend on a module-level import.
          import os

          pdf_directory = options['pdf_directory']
          if not os.path.isdir(pdf_directory):
              # os.walk() silently yields nothing for a missing directory, which would make a
              # mistyped path look like a successful (empty) import.
              raise CommandError(f"Directory \"{pdf_directory}\" does not exist")

          base_name = options['base_collection']
          if base_name:
              try:
                  base_collection = Collection.objects.get(name=base_name)
              except ObjectDoesNotExist as err:
                  raise CommandError(f"Base collection \"{base_name}\" does not exist") from err
              except Collection.MultipleObjectsReturned as err:
                  # Collection names are not unique, so a lookup by name can be ambiguous.
                  raise CommandError(f"More than one collection is named \"{base_name}\"") from err
          else:
              base_collection = Collection.objects.get(pk=get_root_collection_id())

          importer = DocumentImporter()
          importer.import_all(pdf_directory, base_collection, options['dry_run'])
  # core/jobs/document_importer.py

  import hashlib
  import os
  from django.core.files import File

  from wagtail.documents import get_document_model
  from wagtail.models import Collection

  from core.logging import logger


  class DocumentImporter(object):
      """
      Given a nested directory of files, import them into Wagtail's document model - preserving the
      folder structure as nested collections.
      """

      def import_all(self, pdf_directory, base_collection, dry_run=False):
          """
          Import every file found under ``pdf_directory``.

          pdf_directory: root directory; it is walked recursively.
          base_collection: collection that corresponds to ``pdf_directory`` itself.
          dry_run: when True nothing is saved, the importer only logs what it would do.
          """
          for path, file in self._get_files(pdf_directory):
              collection = self._get_collection(path, pdf_directory, base_collection, dry_run)
              self._create_document(file, path, collection, dry_run)

      def _get_files(self, root):
          """Recursively yield a (directory, file name) pair for every file in root and below."""
          for path, dirs, files in os.walk(root):
              # Sorting `dirs` in place makes os.walk descend in a stable order, so repeated
              # runs process (and log) the files in the same sequence.
              dirs.sort()
              for file in sorted(files):
                  yield path, file

      def _get_collection(self, path, pdf_directory, base_collection, dry_run):
          """
          Construct a nested set of collections corresponding to the nested directories.

          Returns the collection for ``path``. Files that live directly in ``pdf_directory``
          belong to ``base_collection`` itself. In a dry run nothing is created, so the deepest
          collection that already exists is returned instead.
          """
          current_parent = base_collection
          rel_path = os.path.relpath(path, pdf_directory)
          if rel_path == os.curdir:
              # relpath() reports the root directory as '.', which must not become a collection.
              return current_parent

          # Only relevant for dry runs: once one level is missing, nothing below it can exist.
          parent_exists = True
          for part in rel_path.split(os.sep):
              collection = None
              if parent_exists:
                  # Direct children only: a same-named collection deeper in the tree is a
                  # different folder and must not be matched.
                  collection = current_parent.get_children().filter(name=part).first()
              if collection:
                  current_parent = collection
                  logger.info(
                      'document_importer.collection.found',
                      dry_run=dry_run,
                      name=part,
                  )
              else:
                  if dry_run:
                      parent_exists = False
                  else:
                      # create this collection
                      collection = Collection(name=part)
                      current_parent.add_child(instance=collection)
                      # Set this as the parent for the next node in our list
                      current_parent = collection
                  logger.info(
                      'document_importer.collection.create',
                      dry_run=dry_run,
                      name=part,
                  )
          return current_parent

      def _create_document(self, file, path, collection, dry_run):
          """
          Create the document for ``file`` or update the existing one.

          An existing document is re-saved only when its content hash or its collection
          differs; both changes are written with a single save and logged once.
          """
          document_model = get_document_model()
          # NOTE(review): documents are matched on the file name alone, so files with the same
          # name in different directories resolve to the same document - confirm this is intended.
          doc = document_model.objects.filter(file__endswith=file).first()
          op = "create" if doc is None else "update"

          if dry_run:
              self.__log_document_changes(op, file, collection, dry_run)
              return

          with open(os.path.join(path, file), "rb") as fd:
              new_hash = hashlib.sha1(fd.read()).hexdigest()
              # Hashing consumed the stream; rewind so the file is stored from its first byte.
              fd.seek(0)

              if doc is None:
                  doc = document_model(title=file, collection=collection)
                  content_changed = True
                  changed = True
              else:
                  content_changed = new_hash != doc.file_hash
                  changed = content_changed
                  if collection != doc.collection:
                      doc.collection = collection
                      changed = True

              if content_changed:
                  doc.file = File(fd, name=file)
                  doc.file_size = len(doc.file)
                  doc.file_hash = new_hash

              if changed:
                  # Save inside the `with` block: the file handle has to be open while it is stored.
                  doc.save()
                  self.__log_document_changes(op, file, collection, dry_run)

      def __log_document_changes(self, op, file, collection, dry_run):
          """Log a single create/update event for ``file``."""
          logger.info(
              f"document_importer.document.{op}",
              dry_run=dry_run,
              file=file,
              collection=collection,
          )

Wagtail 3 Upgrade: Per Site Features

At work we run a large multitenant Wagtail application. Most of the time when one of our customers asks for a feature, we add it and make it available to everyone. But occasionally we get a request that we are willing to add for a specific site (or handful of sites) but do not want to make generally available. A few of our customers have interactive displays in their building and they would like to display content from their web site but don’t want to devote space to some items that are on every page - for example the header, footer, and navigation. This makes a lot of sense for this use case, but we don’t want other groups abusing this feature to opt out of our branding. So, we use feature flags to enable “bare pages” on only a few sites.

We have one code base for all our sites, but within that we have two different sets of features (such as page types and the front end look and feel). Which set of features a site gets is controlled by its theme. Because of some history, the current themes are named ‘v6.1’ and ‘v7.0’.

Because every site will need a theme, every site will have a Features setting. And every request will need to start by figuring out what site it is for and then what theme it should use. To set request.site for each request, we use a version of Wagtail’s SiteMiddleware, which is still available in wagtail.contrib.legacy, so our MIDDLEWARE setting looks something like:

    MIDDLEWARE = [
        # Django's "default" middleware, in the appropriate order according to Django 3 docs.
        # ... (entries elided)

        # Wagtail's legacy SiteMiddleware; used here to set request.site on each request.
        'wagtail.contrib.legacy.sitemiddleware.SiteMiddleware',

        # Enables the use of the get_current_request() and get_current_user() functions.
        'crequest.middleware.CrequestMiddleware',
    ]

Our Features model looks like the code below. Please note that we have never used the ability to “disable a default feature” so if you want to copy this code, I would remove that.

    @register_setting
    class Features(BaseSetting):
        """
        This is a Settings model that has a one-to-one relationship with each Site in the system.
        It stores json blobs that configure its Site's available features. Features are defined through the
        "register_feature" hook, and can have two types:

        1) Default Features. These features are enabled by default on all Sites, but can be explicitly disabled via the
           Features form. These include features like particular Block types.
        2) Special Features. These are features that are only used by a small subset of the Sites on a system, and are
           therefore disabled by default. They can be enabled through the Features form.
           These include features like HSS's Working Papers, or the Startup Map used by OTTCP.

        Implementing what "disabling" a Default Feature, or "enabling" a Special Feature actually means is left up to the
        code that registers the feature. This module only stores the data for which Site enables/disables which Features.
        """
        THEMES = [
            (THEME_61, 'v6.1'),
            (THEME_70, 'v7.0'),
        ]

        # FIELDS
        # The defaults are the `list` callable rather than a `[]` literal, so every new row gets
        # its own fresh list instead of depending on how one shared list object is handled.
        disabled_defaults = jsonfield.JSONField(default=list)
        enabled_specials = jsonfield.JSONField(default=list)
        site_theme = models.CharField(
            "Site Theme",
            max_length=10,
            choices=THEMES,
            default=THEME_70,
            help_text="This setting is only visible to superusers. DO NOT CHANGE THIS SETTING ON ESTABLISHED SITES."
        )

        # FORM CONFIG
        panels = [
            FieldPanel('disabled_defaults', classname='disabled-defaults'),
            FieldPanel('enabled_specials', classname='enabled-specials'),
            FieldPanel('site_theme', classname='site-theme'),
        ]
        base_form_class = FeaturesForm

        def feature_is_enabled(self, machine_name):
            """
            Returns True if the Feature with the given machine name is enabled on the associated Site.
            Since machine names cannot be shared across Special and Default features, this method works for both types.

            Raises UnknownFeatureMachineNameError when the machine name is registered as neither
            a Special nor a Default feature.
            """
            if machine_name in registry['special']:
                # Special features are opt-in: enabled only when explicitly listed.
                return machine_name in self.enabled_specials
            if machine_name in registry['default']:
                # Default features are opt-out: enabled unless explicitly listed as disabled.
                return machine_name not in self.disabled_defaults
            raise UnknownFeatureMachineNameError(f"No Feature exists with the machine name '{machine_name}'")

        class Meta:
            verbose_name = 'Site Features'

Because various parts of the code can register features, we don’t know all the available features at class definition time. So we need a form that builds the list of choices at the time the form is instantiated.

    class FeaturesForm(WagtailAdminModelForm):
        """
        Admin form for the Features setting.

        Both checkbox fields are declared with an empty choice list; the real choices are read
        from the feature registry each time a form instance is created.
        """
        css_class = "features-form rich-settings"

        disabled_defaults = forms.MultipleChoiceField(
            label='Disabled Default Features',
            help_text=mark_safe("Select Default Features that should be <b>disabled</b> on this Site."),
            choices=[],
            widget=forms.CheckboxSelectMultiple,
            required=False,
        )
        enabled_specials = forms.MultipleChoiceField(
            label='Enabled Special Features',
            help_text=mark_safe("Select Special Features that should be <b>enabled</b> on this Site."),
            choices=[],
            widget=forms.CheckboxSelectMultiple,
            required=False,
        )

        def __init__(self, *args, **kwargs):
            """Populate the choices of both fields from the registry, sorted by registry key."""
            super().__init__(*args, **kwargs)
            # (form field name, registry section that supplies its choices)
            choice_sources = (
                ('disabled_defaults', 'default'),
                ('enabled_specials', 'special'),
            )
            for field_name, registry_section in choice_sources:
                self.fields[field_name].choices = sorted(registry[registry_section].items())

Our “bare page” feature is available for a couple of different page types - so they get a “bare_page” field and then their page templates have the necessary code to remove parts of the page when that page attribute is true. The ‘bare_page’ FieldPanel is included in the panels just like a normal field, but then we have a custom form for those page types and it takes care of removing the field from the form unless the “bare page” feature is enabled for the site.

    class FlexPage(BasePage):
        """
        Page type that can optionally be rendered "bare".

        The `bare_page` panel is always declared here; BarePageForm is the form class that
        decides whether the field is actually offered to the editor.
        """
        # field definitions here
        bare_page = models.BooleanField(
            help_text="Render the page without a header or footer.",
            default=False,
        )

        # Editor Panels Configuration
        flex_content_panels = [
            FieldPanel('title', classname='full title'),
            FieldPanel('body'),
        ]

        flex_settings_panels = [
            MultiFieldPanel(
                children=[
                    FieldPanel('slug'),
                    FieldPanel('bare_page'),
                ],
                heading='Page Settings',
            ),
        ]

        # NOTE(review): flex_publishing_panels is not defined in this excerpt - presumably it is
        # declared with the elided field definitions.
        edit_handler = TabbedInterface(
            children=[
                ObjectList(flex_content_panels, heading='Content'),
                ObjectList(flex_settings_panels, heading='Settings', classname='settings'),
                ObjectList(flex_publishing_panels, heading='Publishing'),
            ],
            base_form_class=BarePageForm,
        )

--------------------------------

    class BarePageForm(WagtailAdminPageForm):
        """Page form that only offers the `bare_page` field on Sites with the 'bare_page' feature enabled."""

        def __init__(self, *args, **kwargs):
            """
            Starting with Wagtail 3, we do our form manipulation in the form class initializer,
            not in a get_edit_handler class method.
            """
            super().__init__(*args, **kwargs)

            request = get_current_request()
            if not request:
                # No current request is available: leave the form untouched.
                return

            site = Site.find_for_request(request)
            # A request that maps to no Site cannot have the feature enabled, so the field is
            # removed in that case as well.
            if site is None or not site.features.feature_is_enabled('bare_page'):
                # pop() instead of del: the field may already be missing from this form.
                self.fields.pop('bare_page', None)

Wagtail 3 Upgrade: Per User FieldPanel Permissions

I work on a long running Wagtail project and over the years we have made a lot of customizations (aka monkey patches) to the Wagtail framework. So each upgrade takes a bit of work - though often this leads to cleaner code. In my previous post about custom menu items I described needing to change one of our customizations and finding Wagtail already had a better way to do what I wanted. In other cases, things we had been doing via ugly hacks become part of Wagtail itself. For Wagtail 3, one of those is the new Permission-dependent FieldPanels.

We have event pages that are mainly used for announcing academic seminars but are also used by our Public Programming office to advertise concerts. The concert pages need to have images but we don’t really want to allow other groups to add images to their pages. Similarly, we want the editors of our main site to be able to tag events for display on the home page or on an internally facing page. We had been using a kind of ugly hack to hide the image upload and display_location fields from everyone who did not have the master_calendar.can_access_admin_fields permission. First we created a MultiFieldPanel to contain the items we only want certain people to see and gave it a CSS class we can use to find it:

    MultiFieldPanel(
        # NOTE: The 'admin-only' class is how EventPage.get_edit_handler() identifies this MultiFieldPanel.
        classname='collapsible admin-only',
        heading='Admin-only Fields',
        # Both children are meant to be seen only by users who may access the admin fields.
        children=[
            FieldPanel('display_locations', widget=forms.CheckboxSelectMultiple),
            StreamFieldPanel('assets'),
        ],
    ),

Then we created a get_edit_handler method that uses the panel’s position in the edit handler tree and that CSS class to find and remove that panel:

    @classmethod
    def get_edit_handler(cls):
        """
        Return the bound edit handler, minus the admin-only panel for users who may not see it.

        This overrides the method that wagtail.admin.panels adds to the Page class, in order to
        enforce our custom field-level permissions.
        """
        # Same first step as the stock wagtail.admin.panels.get_edit_handler().
        handler = cls.edit_handler.bind_to(model=cls)

        # Certain manage.py commands call this method without any current request, so a missing
        # request is treated like a user without the permission.
        request = get_current_request()
        if request is not None and request.user.has_perm('master_calendar.can_access_admin_fields'):
            return handler

        # handler.children[0].children is the list of Panels in the Content tab; the admin-only
        # MultiFieldPanel is identified by its CSS class. Only the first match is removed.
        content_panels = handler.children[0].children
        admin_only_panel = next(
            (panel for panel in content_panels if 'admin-only' in panel.classname),
            None,
        )
        if admin_only_panel is not None:
            content_panels.remove(admin_only_panel)
        return handler

As of Wagtail 3, I can remove the get_edit_handler override and enforce our per user permissions in the panel definition:

    MultiFieldPanel(
        classname='collapsible',
        heading='Admin-only Fields',
        # Each child carries the permission itself, so no get_edit_handler() override is needed.
        children=[
            FieldPanel(
                'display_locations',
                widget=forms.CheckboxSelectMultiple,
                permission='master_calendar.can_access_admin_fields',
            ),
            FieldPanel(
                'assets',
                permission='master_calendar.can_access_admin_fields',
            ),
        ],
    ),

If the user does not have the can_access_admin_fields permission, the two FieldPanels get removed which causes the heading for the MultiFieldPanel to disappear. How beautiful is that?

Wagtail: Dynamically Adding Admin Menu Items

I am in the process of upgrading to Wagtail 2.16. One of the new features is a slim admin menu which I am sure many of my laptop users will really like - or would really like - if I had not just added a chunk of code that violates the last item in the exceptions list: “MenuItem can no longer be sub-classed to customize its HTML output or load additional JavaScript.”

I had had an item that was restricted to be “one page of this type per site” and so it was easy to construct a menu item to display all the subpages that could be under that page - I just need to find the PersonIndexPage2 for the current site, and then create a url for the page explorer for that page.

  class PeoplePages2MenuItems(MenuItem):
      """
      "People Pages" menu item whose link is worked out per request, because it points at the
      PersonIndexPage2 of whichever site the request belongs to.
      """

      def __init__(self):
          # The url is left empty here; get_context() fills it in for each request.
          super().__init__(
              label="People Pages",
              url=None,
              classnames="icon icon-user",
              order=300,
          )

      def is_shown(self, request):
          """
          The PeoplePages2MenuItems entry is only shown if there is a PersonPage2Template in the site.
          """
          return PersonPage2Template.objects.in_site(Site.find_for_request(request)).exists()

      def get_context(self, request):
          """
          Constructs the url for listing PersonPage2 pages
          """
          page = PersonIndexPage2.objects.descendant_of(Site.find_for_request(request).root_page).first()
          # is_shown() checks for a PersonPage2Template, not for an index page, so the site may
          # still have no PersonIndexPage2; in that case the url is left as it was.
          if page is not None:
              self.url = reverse('wagtailadmin_explore', args=[page.id]) + "?ordering=title&people_pages_only=True"
          return super().get_context(request)


  @hooks.register('register_people_admin_menu_item')
  def register_people_pages_v2_template_menu_item():
      """Register the "People Pages" entry with the people admin menu."""
      # The menu item class in this module is PeoplePages2MenuItems.
      return PeoplePages2MenuItems()

But then someone asked me if they could add more than one PersonIndexPage2 per site. So we will need more than one menu item for “People Pages” - and we’ll need more than one link per site. So I had a look at the MenuItem class and there is the render code, just begging me to hijack it. So I removed the get_context method above and did all the dirty work in the render_html method.

      def render_html(self, request):
          """
          Render one menu entry per PersonIndexPage2 below the current site's root page and
          return the rendered entries joined by a single space.
          """
          site_root = Site.find_for_request(request).root_page
          listing_query = "?ordering=title&people_pages_only=True"

          def render_entry(index_page):
              # Start from the regular item context, then point it at this index page.
              entry_context = self.get_context(request)
              entry_context['url'] = reverse('wagtailadmin_explore', args=[index_page.id]) + listing_query
              entry_context['label'] = index_page.title
              return render_to_string(self.template, entry_context, request=request)

          index_pages = PersonIndexPage2.objects.descendant_of(site_root).all()
          return ' '.join(render_entry(index_page) for index_page in index_pages)

That was great - for about 2 weeks. Then I started my Wagtail 2.16 upgrade and suddenly my “People Pages” links go to /admin/null.

So I went poking around in the Wagtail source code and found what I probably should have been using all the time. The Menu class has a method menu_items_for_request. This is where the is_shown rules are enforced - but more important for my current issue is the section where it executes any hooks registered by a menu’s construct_hook_name. I have lots of code that uses hooks configured with register_hook_name but it hadn’t occurred to me to look for a request-time equivalent.

So, first I need to define a construct hook:

  class PeopleAdminMenu(Menu):
      """People admin menu, wired to both a register hook and a construct hook."""

      def __init__(self):
          hook_names = {
              'register_hook_name': 'register_people_admin_menu_item',
              'construct_hook_name': 'construct_people_admin_menu_item',
          }
          super().__init__(**hook_names)

Then I replaced my PeoplePages2MenuItems class and the register_people_admin_menu_item hook that added it to the correct top level menu item with a method to add the menu items.

  @hooks.register('construct_people_admin_menu_item')
  def add_people_pages2_menu_items(request, items):
      """
      Append one "People Pages" menu item per PersonIndexPage2 of the current site.

      request: the current request; it determines which site's pages are listed.
      items: the list of menu items being constructed; it is modified in place.
      """
      site = Site.find_for_request(request)
      # Nothing to add when the request maps to no site, or the site has no PersonPage2Template.
      if site is None or not PersonPage2Template.objects.in_site(site).exists():
          return

      for page in PersonIndexPage2.objects.descendant_of(site.root_page):
          pp2_menu_item = MenuItem(
              page.title,
              reverse('wagtailadmin_explore', args=[page.id]) + "?ordering=title&people_pages_only=True",
              # icon_name takes the bare icon name; 'icon icon-user' are CSS class names that
              # belong in `classnames`.
              icon_name='user',
              order=300,
          )
          items.append(pp2_menu_item)

This contains all the same logic as the previous class. The if clause contains the logic from the is_shown method and the class’s init parameters are combined with the dynamic url and label items from the render_html method to instantiate a MenuItem. So much cleaner! I should have been doing it like this all along.

Trimming Wagtail Migration Cruft

Django creates migrations for Django model changes that do not alter the database, for example, changes to help text or verbose names. In my previous post, I shared code for telling Django not to track non-database attributes in its migrations. This post is about something similar for Wagtail’s migrations.

At work, we are using Wagtail as our Content Management System (CMS). The Wagtail core team decided to follow Django’s example and record all model changes in migrations - including ones that do not change the database schema. Unfortunately for us, this means that when we add new blocks to our pages, “makemigrations” thinks it should make a new version of our StreamField - even though no SQL will be run when the migration is installed. We have a lot of blocks and they change fairly frequently, so these StreamField migrations take up a lot of space. And because they are large, they are nearly impossible to diff, so even if we kept them, it would be hard to use them to track down changes to our StreamField definitions.

For the most part, we just ignore it when “manage.py migrate” tells us we have changes in our code that are not reflected in our migrations. But when we do need to create a migration for database schema changes, we either need to accept a new large chunk of code that doesn’t do anything - or we have to manually remove those lines before committing the migration to version control.

I have read the discussion about the issue on the Wagtail issue queue. And, while I tend to agree with the policy decision, I still want to see what life is like without including StreamField definitions in our migrations. So I added the following monkey patch to the app we already have for all of our monkey patches.

  # wagtail_patches/monkey_patches.py

  import wagtail.core.fields

  def deconstruct_without_block_definition(self):
      """
      Replacement for StreamField.deconstruct() that reports an empty list of block types.

      The name, path and keyword arguments come from the deconstruct() of StreamField's
      parent class; the positional arguments are always a single empty list.
      """
      parent = super(wagtail.core.fields.StreamField, self)
      field_name, import_path, _unused_args, field_kwargs = parent.deconstruct()
      return field_name, import_path, [[]], field_kwargs


  # Install the replacement on StreamField itself.
  wagtail.core.fields.StreamField.deconstruct = deconstruct_without_block_definition

This is simply a copy of the StreamField deconstruct method but I replaced “block_types = self.stream_block.child_blocks.items()” with an empty list. Now any field defined as:

  body = wagtail.core.fields.StreamField([ <large list of blocks here> ])

will be represented in the migration file as the following - with no list of blocks:

  ('body', wagtail.core.fields.StreamField([]))

Then I went through all the apps in our project and squashed migrations. This automatically ‘removed’ the StreamField definitions in the migrations included in the squash. Then I manually edited any migrations prior to the current squashing to remove the StreamField definitions from them. Deploying the squashed migrations went smoothly. Now we just need to do some development and see if there is any reason to want to change our minds and start tracking StreamField definitions in our migration files once more.


Addendum

2022-04-12 Per @tbrlpld on the Wagtail Slack: overriding the StreamField deconstruct method as I did above breaks StreamField data migrations. In the context of a data migration the body field in the example above will always return an empty list - so you will not be able to iterate over it to make changes like those seen in this example from the CFPB.

At work, we have been moving away from using data migrations (which stick around and are run each time you build your test database) towards writing “one time” management commands that we run in the needed places and then delete. I haven’t had occasion to do any StreamField manipulations using this technique since we added these monkey patches. So I don’t know if we will face the same problem in that context.