Blame src/libostree/ostree-repo-static-delta-compilation-analysis.c

rpm-build 0fba15
/*
rpm-build 0fba15
 * Copyright (C) 2015 Colin Walters <walters@verbum.org>
rpm-build 0fba15
 *
rpm-build 0fba15
 * SPDX-License-Identifier: LGPL-2.0+
rpm-build 0fba15
 *
rpm-build 0fba15
 * This library is free software; you can redistribute it and/or
rpm-build 0fba15
 * modify it under the terms of the GNU Lesser General Public
rpm-build 0fba15
 * License as published by the Free Software Foundation; either
rpm-build 0fba15
 * version 2 of the License, or (at your option) any later version.
rpm-build 0fba15
 *
rpm-build 0fba15
 * This library is distributed in the hope that it will be useful,
rpm-build 0fba15
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
rpm-build 0fba15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
rpm-build 0fba15
 * Lesser General Public License for more details.
rpm-build 0fba15
 *
rpm-build 0fba15
 * You should have received a copy of the GNU Lesser General Public
rpm-build 0fba15
 * License along with this library; if not, write to the
rpm-build 0fba15
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
rpm-build 0fba15
 * Boston, MA 02111-1307, USA.
rpm-build 0fba15
 */
rpm-build 0fba15
rpm-build 0fba15
#include "config.h"
rpm-build 0fba15
rpm-build 0fba15
#include <string.h>
rpm-build 0fba15
#include <gio/gunixoutputstream.h>
rpm-build 0fba15
rpm-build 0fba15
#include "ostree-core-private.h"
rpm-build 0fba15
#include "ostree-repo-private.h"
rpm-build 0fba15
#include "ostree-lzma-compressor.h"
rpm-build 0fba15
#include "ostree-repo-static-delta-private.h"
rpm-build 0fba15
#include "ostree-diff.h"
rpm-build 0fba15
#include "ostree-rollsum.h"
rpm-build 0fba15
#include "otutil.h"
rpm-build 0fba15
#include "ostree-varint.h"
rpm-build 0fba15
rpm-build 0fba15
void
rpm-build 0fba15
_ostree_delta_content_sizenames_free (gpointer v)
rpm-build 0fba15
{
rpm-build 0fba15
  OstreeDeltaContentSizeNames *ce = v;
rpm-build 0fba15
  g_free (ce->checksum);
rpm-build 0fba15
  g_ptr_array_unref (ce->basenames);
rpm-build 0fba15
  g_free (ce);
rpm-build 0fba15
}
rpm-build 0fba15
rpm-build 0fba15
static gboolean
rpm-build 0fba15
build_content_sizenames_recurse (OstreeRepo                     *repo,
rpm-build 0fba15
                                 OstreeRepoCommitTraverseIter   *iter,
rpm-build 0fba15
                                 GHashTable                     *sizenames_map,
rpm-build 0fba15
                                 GHashTable                     *include_only_objects,
rpm-build 0fba15
                                 GCancellable                   *cancellable,
rpm-build 0fba15
                                 GError                        **error)
rpm-build 0fba15
{
rpm-build 0fba15
  gboolean ret = FALSE;
rpm-build 0fba15
rpm-build 0fba15
  while (TRUE)
rpm-build 0fba15
    {
rpm-build 0fba15
      OstreeRepoCommitIterResult iterres =
rpm-build 0fba15
        ostree_repo_commit_traverse_iter_next (iter, cancellable, error);
rpm-build 0fba15
          
rpm-build 0fba15
      if (iterres == OSTREE_REPO_COMMIT_ITER_RESULT_ERROR)
rpm-build 0fba15
        goto out;
rpm-build 0fba15
      else if (iterres == OSTREE_REPO_COMMIT_ITER_RESULT_END)
rpm-build 0fba15
        break;
rpm-build 0fba15
      else if (iterres == OSTREE_REPO_COMMIT_ITER_RESULT_FILE)
rpm-build 0fba15
        {
rpm-build 0fba15
          char *name;
rpm-build 0fba15
          char *checksum;
rpm-build 0fba15
          OstreeDeltaContentSizeNames *csizenames;
rpm-build 0fba15
            
rpm-build 0fba15
          ostree_repo_commit_traverse_iter_get_file (iter, &name, &checksum);
rpm-build 0fba15
rpm-build 0fba15
          if (include_only_objects && !g_hash_table_contains (include_only_objects, checksum))
rpm-build 0fba15
            continue;
rpm-build 0fba15
rpm-build 0fba15
          csizenames = g_hash_table_lookup (sizenames_map, checksum);
rpm-build 0fba15
          if (!csizenames)
rpm-build 0fba15
            {
rpm-build 0fba15
              g_autoptr(GFileInfo) finfo = NULL;
rpm-build 0fba15
rpm-build 0fba15
              if (!ostree_repo_load_file (repo, checksum,
rpm-build 0fba15
                                          NULL, &finfo, NULL,
rpm-build 0fba15
                                          cancellable, error))
rpm-build 0fba15
                goto out;
rpm-build 0fba15
rpm-build 0fba15
              if (g_file_info_get_file_type (finfo) != G_FILE_TYPE_REGULAR)
rpm-build 0fba15
                continue;
rpm-build 0fba15
rpm-build 0fba15
              csizenames = g_new0 (OstreeDeltaContentSizeNames, 1);
rpm-build 0fba15
              csizenames->checksum = g_strdup (checksum);
rpm-build 0fba15
              csizenames->size = g_file_info_get_size (finfo);
rpm-build 0fba15
              g_hash_table_replace (sizenames_map, csizenames->checksum, csizenames);
rpm-build 0fba15
            }
rpm-build 0fba15
rpm-build 0fba15
          if (!csizenames->basenames)
rpm-build 0fba15
            csizenames->basenames = g_ptr_array_new_with_free_func (g_free);
rpm-build 0fba15
          g_ptr_array_add (csizenames->basenames, g_strdup (name));
rpm-build 0fba15
        }
rpm-build 0fba15
      else if (iterres == OSTREE_REPO_COMMIT_ITER_RESULT_DIR)
rpm-build 0fba15
        {
rpm-build 0fba15
          char *name;
rpm-build 0fba15
          char *content_checksum;
rpm-build 0fba15
          char *meta_checksum;
rpm-build 0fba15
          g_autoptr(GVariant) dirtree = NULL;
rpm-build 0fba15
          ostree_cleanup_repo_commit_traverse_iter
rpm-build 0fba15
            OstreeRepoCommitTraverseIter subiter = { 0, };
rpm-build 0fba15
rpm-build 0fba15
          ostree_repo_commit_traverse_iter_get_dir (iter, &name, &content_checksum, &meta_checksum);
rpm-build 0fba15
          
rpm-build 0fba15
          if (!ostree_repo_load_variant (repo, OSTREE_OBJECT_TYPE_DIR_TREE,
rpm-build 0fba15
                                         content_checksum, &dirtree,
rpm-build 0fba15
                                         error))
rpm-build 0fba15
            goto out;
rpm-build 0fba15
rpm-build 0fba15
          if (!ostree_repo_commit_traverse_iter_init_dirtree (&subiter, repo, dirtree,
rpm-build 0fba15
                                                              OSTREE_REPO_COMMIT_TRAVERSE_FLAG_NONE,
rpm-build 0fba15
                                                              error))
rpm-build 0fba15
            goto out;
rpm-build 0fba15
rpm-build 0fba15
          if (!build_content_sizenames_recurse (repo, &subiter,
rpm-build 0fba15
                                                sizenames_map, include_only_objects,
rpm-build 0fba15
                                                cancellable, error))
rpm-build 0fba15
            goto out;
rpm-build 0fba15
        }
rpm-build 0fba15
      else
rpm-build 0fba15
        g_assert_not_reached ();
rpm-build 0fba15
    }
rpm-build 0fba15
  ret = TRUE;
rpm-build 0fba15
 out:
rpm-build 0fba15
  return ret;
rpm-build 0fba15
}
rpm-build 0fba15
rpm-build 0fba15
static int
rpm-build 0fba15
compare_sizenames (const void  *a,
rpm-build 0fba15
                   const void  *b)
rpm-build 0fba15
{
rpm-build 0fba15
  OstreeDeltaContentSizeNames *sn_a = *(OstreeDeltaContentSizeNames**)(void*)a;
rpm-build 0fba15
  OstreeDeltaContentSizeNames *sn_b = *(OstreeDeltaContentSizeNames**)(void*)b;
rpm-build 0fba15
rpm-build 0fba15
  return sn_a->size - sn_b->size;
rpm-build 0fba15
}
rpm-build 0fba15
rpm-build 0fba15
/*
rpm-build 0fba15
 * Generate a sorted array of [(checksum: str, size: uint64, names: array[string]), ...]
rpm-build 0fba15
 * for regular file content.
rpm-build 0fba15
 */
rpm-build 0fba15
static gboolean
rpm-build 0fba15
build_content_sizenames_filtered (OstreeRepo              *repo,
rpm-build 0fba15
                                  GVariant                *commit,
rpm-build 0fba15
                                  GHashTable              *include_only_objects,
rpm-build 0fba15
                                  GPtrArray              **out_sizenames,
rpm-build 0fba15
                                  GCancellable            *cancellable,
rpm-build 0fba15
                                  GError                 **error)
rpm-build 0fba15
{
rpm-build 0fba15
  gboolean ret = FALSE;
rpm-build 0fba15
  g_autoptr(GPtrArray) ret_sizenames =
rpm-build 0fba15
    g_ptr_array_new_with_free_func (_ostree_delta_content_sizenames_free);
rpm-build 0fba15
  g_autoptr(GHashTable) sizenames_map =
rpm-build 0fba15
    g_hash_table_new_full (g_str_hash, g_str_equal, NULL, _ostree_delta_content_sizenames_free);
rpm-build 0fba15
  ostree_cleanup_repo_commit_traverse_iter
rpm-build 0fba15
    OstreeRepoCommitTraverseIter iter = { 0, };
rpm-build 0fba15
rpm-build 0fba15
  if (!ostree_repo_commit_traverse_iter_init_commit (&iter, repo, commit,
rpm-build 0fba15
                                                     OSTREE_REPO_COMMIT_TRAVERSE_FLAG_NONE,
rpm-build 0fba15
                                                     error))
rpm-build 0fba15
    goto out;
rpm-build 0fba15
rpm-build 0fba15
  if (!build_content_sizenames_recurse (repo, &iter, sizenames_map, include_only_objects,
rpm-build 0fba15
                                        cancellable, error))
rpm-build 0fba15
    goto out;
rpm-build 0fba15
rpm-build 0fba15
  { GHashTableIter hashiter;
rpm-build 0fba15
    gpointer hkey, hvalue;
rpm-build 0fba15
rpm-build 0fba15
    g_hash_table_iter_init (&hashiter, sizenames_map);
rpm-build 0fba15
    while (g_hash_table_iter_next (&hashiter, &hkey, &hvalue))
rpm-build 0fba15
      {
rpm-build 0fba15
        g_hash_table_iter_steal (&hashiter);
rpm-build 0fba15
        g_ptr_array_add (ret_sizenames, hvalue);
rpm-build 0fba15
      }
rpm-build 0fba15
  }
rpm-build 0fba15
rpm-build 0fba15
  g_ptr_array_sort (ret_sizenames, compare_sizenames);
rpm-build 0fba15
rpm-build 0fba15
  ret = TRUE;
rpm-build 0fba15
  if (out_sizenames)
rpm-build 0fba15
    *out_sizenames = g_steal_pointer (&ret_sizenames);
rpm-build 0fba15
 out:
rpm-build 0fba15
  return ret;
rpm-build 0fba15
}
rpm-build 0fba15
rpm-build 0fba15
static gboolean
rpm-build 0fba15
string_array_nonempty_intersection (GPtrArray    *a,
rpm-build 0fba15
                                    GPtrArray    *b,
rpm-build 0fba15
                                    gboolean      fuzzy)
rpm-build 0fba15
{
rpm-build 0fba15
  guint i;
rpm-build 0fba15
  for (i = 0; i < a->len; i++)
rpm-build 0fba15
    {
rpm-build 0fba15
      guint j;
rpm-build 0fba15
      const char *a_str = a->pdata[i];
rpm-build 0fba15
      const char *a_dot = strchr (a_str, '.');
rpm-build 0fba15
      for (j = 0; j < b->len; j++)
rpm-build 0fba15
        {
rpm-build 0fba15
          const char *b_str = b->pdata[j];
rpm-build 0fba15
          const char *b_dot = strchr (b_str, '.');
rpm-build 0fba15
          /* When doing fuzzy comparison, just compare the part before the '.' if it exists.  */
rpm-build 0fba15
          if (fuzzy && a_dot && b_dot && b_dot - b_str && b_dot - b_str == a_dot - a_str)
rpm-build 0fba15
            {
rpm-build 0fba15
              if (strncmp (a_str, b_str, a_dot - a_str) == 0)
rpm-build 0fba15
                return TRUE;
rpm-build 0fba15
            }
rpm-build 0fba15
          else
rpm-build 0fba15
            {
rpm-build 0fba15
              if (strcmp (a_str, b_str) == 0)
rpm-build 0fba15
                return TRUE;
rpm-build 0fba15
            }
rpm-build 0fba15
        }
rpm-build 0fba15
    }
rpm-build 0fba15
  return FALSE;
rpm-build 0fba15
}
rpm-build 0fba15
rpm-build 0fba15
static gboolean
rpm-build 0fba15
sizename_is_delta_candidate (OstreeDeltaContentSizeNames *sizename)
rpm-build 0fba15
{
rpm-build 0fba15
  /* Don't build candidates for the empty object */
rpm-build 0fba15
  if (sizename->size == 0)
rpm-build 0fba15
    return FALSE;
rpm-build 0fba15
rpm-build 0fba15
  /* Look for known non-delta-able files (currently just compression like xz) */
rpm-build 0fba15
  for (guint i = 0; i < sizename->basenames->len; i++)
rpm-build 0fba15
    {
rpm-build 0fba15
      const char *name = sizename->basenames->pdata[i];
rpm-build 0fba15
      /* We could replace this down the line with g_content_type_guess() or so,
rpm-build 0fba15
       * but it's not clear to me that's a major win; we'd still need to
rpm-build 0fba15
       * maintain a list of compression formats, and this doesn't require
rpm-build 0fba15
       * allocation.
rpm-build 0fba15
       * NB: We explicitly don't have .gz here in case someone might be
rpm-build 0fba15
       * using --rsyncable for that.
rpm-build 0fba15
       */
rpm-build 0fba15
      const char *dot = strrchr (name, '.');
rpm-build 0fba15
      if (!dot)
rpm-build 0fba15
        continue;
rpm-build 0fba15
      const char *extension = dot+1;
rpm-build 0fba15
      /* Don't add .gz here, see above */
rpm-build 0fba15
      if (g_str_equal (extension, "xz") || g_str_equal (extension, "bz2"))
rpm-build 0fba15
        return FALSE;
rpm-build 0fba15
    }
rpm-build 0fba15
rpm-build 0fba15
  /* Let's try it */
rpm-build 0fba15
  return TRUE;
rpm-build 0fba15
}
rpm-build 0fba15
rpm-build 0fba15
/*
rpm-build 0fba15
 * Build up a map of files with matching basenames and similar size,
rpm-build 0fba15
 * and use it to find apparently similar objects.
rpm-build 0fba15
 *
rpm-build 0fba15
 * @new_reachable_regfile_content is a Set<checksum> of new regular
rpm-build 0fba15
 * file objects.
rpm-build 0fba15
 *
rpm-build 0fba15
 * Currently, @out_modified_regfile_content will be a Map<to checksum,from checksum>;
rpm-build 0fba15
 * however in the future it would be easy to have this function return
rpm-build 0fba15
 * multiple candidate matches.  The hard part would be changing
rpm-build 0fba15
 * the delta compiler to iterate over all matches, determine
rpm-build 0fba15
 * a cost for each one, then pick the best.
rpm-build 0fba15
 */
rpm-build 0fba15
gboolean
rpm-build 0fba15
_ostree_delta_compute_similar_objects (OstreeRepo                 *repo,
rpm-build 0fba15
                                       GVariant                   *from_commit,
rpm-build 0fba15
                                       GVariant                   *to_commit,
rpm-build 0fba15
                                       GHashTable                 *new_reachable_regfile_content,
rpm-build 0fba15
                                       guint                       similarity_percent_threshold,
rpm-build 0fba15
                                       GHashTable                **out_modified_regfile_content,
rpm-build 0fba15
                                       GCancellable               *cancellable,
rpm-build 0fba15
                                       GError                    **error)
rpm-build 0fba15
{
rpm-build 0fba15
  gboolean ret = FALSE;
rpm-build 0fba15
  g_autoptr(GHashTable) ret_modified_regfile_content =
rpm-build 0fba15
    g_hash_table_new_full (g_str_hash, g_str_equal, g_free, g_free);
rpm-build 0fba15
  g_autoptr(GPtrArray) from_sizes = NULL;
rpm-build 0fba15
  g_autoptr(GPtrArray) to_sizes = NULL;
rpm-build 0fba15
  guint i, j;
rpm-build 0fba15
  guint lower;
rpm-build 0fba15
  guint upper;
rpm-build 0fba15
rpm-build 0fba15
  if (!build_content_sizenames_filtered (repo, from_commit, NULL,
rpm-build 0fba15
                                         &from_sizes,
rpm-build 0fba15
                                         cancellable, error))
rpm-build 0fba15
    goto out;
rpm-build 0fba15
rpm-build 0fba15
  if (!build_content_sizenames_filtered (repo, to_commit, new_reachable_regfile_content,
rpm-build 0fba15
                                         &to_sizes,
rpm-build 0fba15
                                         cancellable, error))
rpm-build 0fba15
    goto out;
rpm-build 0fba15
rpm-build 0fba15
  /* Iterate over all newly added objects, find objects which have
rpm-build 0fba15
   * similar basename and sizes.
rpm-build 0fba15
   *
rpm-build 0fba15
   * Because the arrays are sorted by size, we can maintain a `lower`
rpm-build 0fba15
   * bound on the original (from) objects to start searching.
rpm-build 0fba15
   */
rpm-build 0fba15
  lower = 0;
rpm-build 0fba15
  upper = from_sizes->len;
rpm-build 0fba15
  for (i = 0; i < to_sizes->len; i++)
rpm-build 0fba15
    {
rpm-build 0fba15
      int fuzzy;
rpm-build 0fba15
      gboolean found = FALSE;
rpm-build 0fba15
      OstreeDeltaContentSizeNames *to_sizenames = to_sizes->pdata[i];
rpm-build 0fba15
      const guint64 min_threshold = to_sizenames->size *
rpm-build 0fba15
        (1.0-similarity_percent_threshold/100.0);
rpm-build 0fba15
      const guint64 max_threshold = to_sizenames->size *
rpm-build 0fba15
        (1.0+similarity_percent_threshold/100.0);
rpm-build 0fba15
rpm-build 0fba15
      if (!sizename_is_delta_candidate (to_sizenames))
rpm-build 0fba15
        continue;
rpm-build 0fba15
rpm-build 0fba15
      for (fuzzy = 0; fuzzy < 2 && !found; fuzzy++)
rpm-build 0fba15
        {
rpm-build 0fba15
          for (j = lower; j < upper; j++)
rpm-build 0fba15
            {
rpm-build 0fba15
              OstreeDeltaContentSizeNames *from_sizenames = from_sizes->pdata[j];
rpm-build 0fba15
              if (!sizename_is_delta_candidate (from_sizenames))
rpm-build 0fba15
                continue;
rpm-build 0fba15
rpm-build 0fba15
              if (from_sizenames->size < min_threshold)
rpm-build 0fba15
                {
rpm-build 0fba15
                  lower++;
rpm-build 0fba15
                  continue;
rpm-build 0fba15
                }
rpm-build 0fba15
rpm-build 0fba15
              if (from_sizenames->size > max_threshold)
rpm-build 0fba15
                break;
rpm-build 0fba15
rpm-build 0fba15
              if (!string_array_nonempty_intersection (from_sizenames->basenames,
rpm-build 0fba15
                                                       to_sizenames->basenames,
rpm-build 0fba15
                                                       fuzzy == 1))
rpm-build 0fba15
                {
rpm-build 0fba15
                  continue;
rpm-build 0fba15
                }
rpm-build 0fba15
rpm-build 0fba15
              /* Only one candidate right now */
rpm-build 0fba15
              g_hash_table_insert (ret_modified_regfile_content,
rpm-build 0fba15
                                   g_strdup (to_sizenames->checksum),
rpm-build 0fba15
                                   g_strdup (from_sizenames->checksum));
rpm-build 0fba15
              found = TRUE;
rpm-build 0fba15
              break;
rpm-build 0fba15
            }
rpm-build 0fba15
        }
rpm-build 0fba15
    }
rpm-build 0fba15
rpm-build 0fba15
  ret = TRUE;
rpm-build 0fba15
  if (out_modified_regfile_content)
rpm-build 0fba15
    *out_modified_regfile_content = g_steal_pointer (&ret_modified_regfile_content);
rpm-build 0fba15
 out:
rpm-build 0fba15
  return ret;
rpm-build 0fba15
}