Blob Blame History Raw
/*
 * Copyright (C) 2015 Colin Walters <walters@verbum.org>
 *
 * SPDX-License-Identifier: LGPL-2.0+
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include "config.h"

#include <string.h>
#include <gio/gunixoutputstream.h>

#include "ostree-core-private.h"
#include "ostree-repo-private.h"
#include "ostree-lzma-compressor.h"
#include "ostree-repo-static-delta-private.h"
#include "ostree-diff.h"
#include "ostree-rollsum.h"
#include "otutil.h"
#include "ostree-varint.h"

void
_ostree_delta_content_sizenames_free (gpointer v)
{
  OstreeDeltaContentSizeNames *ce = v;
  g_free (ce->checksum);
  g_ptr_array_unref (ce->basenames);
  g_free (ce);
}

static gboolean
build_content_sizenames_recurse (OstreeRepo                     *repo,
                                 OstreeRepoCommitTraverseIter   *iter,
                                 GHashTable                     *sizenames_map,
                                 GHashTable                     *include_only_objects,
                                 GCancellable                   *cancellable,
                                 GError                        **error)
{
  gboolean ret = FALSE;

  while (TRUE)
    {
      OstreeRepoCommitIterResult iterres =
        ostree_repo_commit_traverse_iter_next (iter, cancellable, error);
          
      if (iterres == OSTREE_REPO_COMMIT_ITER_RESULT_ERROR)
        goto out;
      else if (iterres == OSTREE_REPO_COMMIT_ITER_RESULT_END)
        break;
      else if (iterres == OSTREE_REPO_COMMIT_ITER_RESULT_FILE)
        {
          char *name;
          char *checksum;
          OstreeDeltaContentSizeNames *csizenames;
            
          ostree_repo_commit_traverse_iter_get_file (iter, &name, &checksum);

          if (include_only_objects && !g_hash_table_contains (include_only_objects, checksum))
            continue;

          csizenames = g_hash_table_lookup (sizenames_map, checksum);
          if (!csizenames)
            {
              g_autoptr(GFileInfo) finfo = NULL;

              if (!ostree_repo_load_file (repo, checksum,
                                          NULL, &finfo, NULL,
                                          cancellable, error))
                goto out;

              if (g_file_info_get_file_type (finfo) != G_FILE_TYPE_REGULAR)
                continue;

              csizenames = g_new0 (OstreeDeltaContentSizeNames, 1);
              csizenames->checksum = g_strdup (checksum);
              csizenames->size = g_file_info_get_size (finfo);
              g_hash_table_replace (sizenames_map, csizenames->checksum, csizenames);
            }

          if (!csizenames->basenames)
            csizenames->basenames = g_ptr_array_new_with_free_func (g_free);
          g_ptr_array_add (csizenames->basenames, g_strdup (name));
        }
      else if (iterres == OSTREE_REPO_COMMIT_ITER_RESULT_DIR)
        {
          char *name;
          char *content_checksum;
          char *meta_checksum;
          g_autoptr(GVariant) dirtree = NULL;
          ostree_cleanup_repo_commit_traverse_iter
            OstreeRepoCommitTraverseIter subiter = { 0, };

          ostree_repo_commit_traverse_iter_get_dir (iter, &name, &content_checksum, &meta_checksum);
          
          if (!ostree_repo_load_variant (repo, OSTREE_OBJECT_TYPE_DIR_TREE,
                                         content_checksum, &dirtree,
                                         error))
            goto out;

          if (!ostree_repo_commit_traverse_iter_init_dirtree (&subiter, repo, dirtree,
                                                              OSTREE_REPO_COMMIT_TRAVERSE_FLAG_NONE,
                                                              error))
            goto out;

          if (!build_content_sizenames_recurse (repo, &subiter,
                                                sizenames_map, include_only_objects,
                                                cancellable, error))
            goto out;
        }
      else
        g_assert_not_reached ();
    }
  ret = TRUE;
 out:
  return ret;
}

static int
compare_sizenames (const void  *a,
                   const void  *b)
{
  OstreeDeltaContentSizeNames *sn_a = *(OstreeDeltaContentSizeNames**)(void*)a;
  OstreeDeltaContentSizeNames *sn_b = *(OstreeDeltaContentSizeNames**)(void*)b;

  return sn_a->size - sn_b->size;
}

/*
 * Generate a sorted array of [(checksum: str, size: uint64, names: array[string]), ...]
 * for regular file content.
 */
static gboolean
build_content_sizenames_filtered (OstreeRepo              *repo,
                                  GVariant                *commit,
                                  GHashTable              *include_only_objects,
                                  GPtrArray              **out_sizenames,
                                  GCancellable            *cancellable,
                                  GError                 **error)
{
  gboolean ret = FALSE;
  g_autoptr(GPtrArray) ret_sizenames =
    g_ptr_array_new_with_free_func (_ostree_delta_content_sizenames_free);
  g_autoptr(GHashTable) sizenames_map =
    g_hash_table_new_full (g_str_hash, g_str_equal, NULL, _ostree_delta_content_sizenames_free);
  ostree_cleanup_repo_commit_traverse_iter
    OstreeRepoCommitTraverseIter iter = { 0, };

  if (!ostree_repo_commit_traverse_iter_init_commit (&iter, repo, commit,
                                                     OSTREE_REPO_COMMIT_TRAVERSE_FLAG_NONE,
                                                     error))
    goto out;

  if (!build_content_sizenames_recurse (repo, &iter, sizenames_map, include_only_objects,
                                        cancellable, error))
    goto out;

  { GHashTableIter hashiter;
    gpointer hkey, hvalue;

    g_hash_table_iter_init (&hashiter, sizenames_map);
    while (g_hash_table_iter_next (&hashiter, &hkey, &hvalue))
      {
        g_hash_table_iter_steal (&hashiter);
        g_ptr_array_add (ret_sizenames, hvalue);
      }
  }

  g_ptr_array_sort (ret_sizenames, compare_sizenames);

  ret = TRUE;
  if (out_sizenames)
    *out_sizenames = g_steal_pointer (&ret_sizenames);
 out:
  return ret;
}

static gboolean
string_array_nonempty_intersection (GPtrArray    *a,
                                    GPtrArray    *b,
                                    gboolean      fuzzy)
{
  guint i;
  for (i = 0; i < a->len; i++)
    {
      guint j;
      const char *a_str = a->pdata[i];
      const char *a_dot = strchr (a_str, '.');
      for (j = 0; j < b->len; j++)
        {
          const char *b_str = b->pdata[j];
          const char *b_dot = strchr (b_str, '.');
          /* When doing fuzzy comparison, just compare the part before the '.' if it exists.  */
          if (fuzzy && a_dot && b_dot && b_dot - b_str && b_dot - b_str == a_dot - a_str)
            {
              if (strncmp (a_str, b_str, a_dot - a_str) == 0)
                return TRUE;
            }
          else
            {
              if (strcmp (a_str, b_str) == 0)
                return TRUE;
            }
        }
    }
  return FALSE;
}

static gboolean
sizename_is_delta_candidate (OstreeDeltaContentSizeNames *sizename)
{
  /* Don't build candidates for the empty object */
  if (sizename->size == 0)
    return FALSE;

  /* Look for known non-delta-able files (currently just compression like xz) */
  for (guint i = 0; i < sizename->basenames->len; i++)
    {
      const char *name = sizename->basenames->pdata[i];
      /* We could replace this down the line with g_content_type_guess() or so,
       * but it's not clear to me that's a major win; we'd still need to
       * maintain a list of compression formats, and this doesn't require
       * allocation.
       * NB: We explicitly don't have .gz here in case someone might be
       * using --rsyncable for that.
       */
      const char *dot = strrchr (name, '.');
      if (!dot)
        continue;
      const char *extension = dot+1;
      /* Don't add .gz here, see above */
      if (g_str_equal (extension, "xz") || g_str_equal (extension, "bz2"))
        return FALSE;
    }

  /* Let's try it */
  return TRUE;
}

/*
 * Build up a map of files with matching basenames and similar size,
 * and use it to find apparently similar objects.
 *
 * @new_reachable_regfile_content is a Set<checksum> of new regular
 * file objects.
 *
 * Currently, @out_modified_regfile_content will be a Map<to checksum,from checksum>;
 * however in the future it would be easy to have this function return
 * multiple candidate matches.  The hard part would be changing
 * the delta compiler to iterate over all matches, determine
 * a cost for each one, then pick the best.
 */
gboolean
_ostree_delta_compute_similar_objects (OstreeRepo                 *repo,
                                       GVariant                   *from_commit,
                                       GVariant                   *to_commit,
                                       GHashTable                 *new_reachable_regfile_content,
                                       guint                       similarity_percent_threshold,
                                       GHashTable                **out_modified_regfile_content,
                                       GCancellable               *cancellable,
                                       GError                    **error)
{
  gboolean ret = FALSE;
  g_autoptr(GHashTable) ret_modified_regfile_content =
    g_hash_table_new_full (g_str_hash, g_str_equal, g_free, g_free);
  g_autoptr(GPtrArray) from_sizes = NULL;
  g_autoptr(GPtrArray) to_sizes = NULL;
  guint i, j;
  guint lower;
  guint upper;

  if (!build_content_sizenames_filtered (repo, from_commit, NULL,
                                         &from_sizes,
                                         cancellable, error))
    goto out;

  if (!build_content_sizenames_filtered (repo, to_commit, new_reachable_regfile_content,
                                         &to_sizes,
                                         cancellable, error))
    goto out;

  /* Iterate over all newly added objects, find objects which have
   * similar basename and sizes.
   *
   * Because the arrays are sorted by size, we can maintain a `lower`
   * bound on the original (from) objects to start searching.
   */
  lower = 0;
  upper = from_sizes->len;
  for (i = 0; i < to_sizes->len; i++)
    {
      int fuzzy;
      gboolean found = FALSE;
      OstreeDeltaContentSizeNames *to_sizenames = to_sizes->pdata[i];
      const guint64 min_threshold = to_sizenames->size *
        (1.0-similarity_percent_threshold/100.0);
      const guint64 max_threshold = to_sizenames->size *
        (1.0+similarity_percent_threshold/100.0);

      if (!sizename_is_delta_candidate (to_sizenames))
        continue;

      for (fuzzy = 0; fuzzy < 2 && !found; fuzzy++)
        {
          for (j = lower; j < upper; j++)
            {
              OstreeDeltaContentSizeNames *from_sizenames = from_sizes->pdata[j];
              if (!sizename_is_delta_candidate (from_sizenames))
                continue;

              if (from_sizenames->size < min_threshold)
                {
                  lower++;
                  continue;
                }

              if (from_sizenames->size > max_threshold)
                break;

              if (!string_array_nonempty_intersection (from_sizenames->basenames,
                                                       to_sizenames->basenames,
                                                       fuzzy == 1))
                {
                  continue;
                }

              /* Only one candidate right now */
              g_hash_table_insert (ret_modified_regfile_content,
                                   g_strdup (to_sizenames->checksum),
                                   g_strdup (from_sizenames->checksum));
              found = TRUE;
              break;
            }
        }
    }

  ret = TRUE;
  if (out_modified_regfile_content)
    *out_modified_regfile_content = g_steal_pointer (&ret_modified_regfile_content);
 out:
  return ret;
}