fetch.py: Factor out _get_uris

Alec Warner Sun, 19 Jan 2014 14:37:47 -0800

On Sat, Jan 18, 2014 at 7:07 PM, W. Trevor King <wk...@tremily.us> wrote:


> The current fetch() function is quite long, which makes it hard to
> know what I can change without adverse side effects.  By pulling this
> logic out of the main function, we get clearer logic in fetch() and
> more explicit input for the config extraction.
>
> This block was especially complicated, so I also created the helper
> functions _get_file_uri_tuples and _expand_mirror.  I'd prefer if
> _expand_mirror iterated through URIs instead of (group, URI) tuples,
> but we need a distinct marker for third-party URIs to build
> third_party_mirror_uris which is used to build primaryuri_dict which
> is used way down in fetch():
>
>   if checksum_failure_count == \
>       checksum_failure_primaryuri:
>       # Switch to "primaryuri" mode in order
>       # to increase the probablility of
>       # of success.
>       primaryuris = \
>           primaryuri_dict.get(myfile)
>           if primaryuris:
>               uri_list.extend(
>                   reversed(primaryuris))
>
> I don't know if this is useful enough to motivate the uglier
> _expandmirror return values, but I'll kick that can down the road for
> now.
> ---
>  pym/portage/package/ebuild/fetch.py | 197
> +++++++++++++++++++++---------------
>  1 file changed, 117 insertions(+), 80 deletions(-)
>
> diff --git a/pym/portage/package/ebuild/fetch.py
> b/pym/portage/package/ebuild/fetch.py
> index bd572fa..a617945 100644
> --- a/pym/portage/package/ebuild/fetch.py
> +++ b/pym/portage/package/ebuild/fetch.py
> @@ -15,9 +15,9 @@ import sys
>  import tempfile
>
>  try:
> -       from urllib.parse import urlparse
> +       from urllib.parse import urlparse, urlunparse
>  except ImportError:
> -       from urlparse import urlparse
> +       from urlparse import urlparse, urlunparse
>
>  import portage
>  portage.proxy.lazyimport.lazyimport(globals(),
> @@ -297,6 +297,118 @@ def _get_fetch_resume_size(settings, default='350K'):
>         return v
>
>
> +def _get_file_uri_tuples(uris):
> +       """Return a list of (filename, uri) tuples
> +       """
>

As mike noted on another thread:

""Return a list of (filename, uri) tuples."""

+       file_uri_tuples = []
> +       # Check for 'items' attribute since OrderedDict is not a dict.
> +       if hasattr(uris, 'items'):
> +               for filename, uri_set in uris.items():
> +                       for uri in uri_set:
> +                               file_uri_tuples.append((filename, uri))
> +                       if not uri_set:
> +                               file_uri_tuples.append((filename, None))
> +       else:
> +               for uri in uris:
> +                       if urlparse(uri).scheme:
> +                               file_uri_tuples.append(
> +                                       (os.path.basename(myuri), myuri))
> +                       else:
> +                               file_uri_tuples.append(
> +                                       (os.path.basename(myuri), None))
> +       return file_uri_tuples
> +
> +
> +def _expand_mirror(uri, custom_mirrors=(), third_party_mirrors=()):
> +       """Replace the 'mirror://' scheme in the uri
>

Sentence should end in a period.


> +
> +       Returns an iterable listing expanded (group, URI) tuples,
> +       where the group is either 'custom' or 'third-party'.
> +        """
> +       parsed = urlparse(uri)
> +       mirror = parsed.netloc
> +       path = parsed.path
> +       if path:
> +               # Try user-defined mirrors first
> +               if mirror in custom_mirrors:
> +                       for cmirr in custom_mirrors[mirror]:
> +                               m_uri = urlparse(cmirr)
> +                               yield ('custom', urlunparse((
> +                                       cmirr.scheme, cmirr.netloc, path) +
> +                                       parsed[3:]))
> +
> +               # now try the official mirrors
> +               if mirror in third_party_mirrors:
> +                       uris = []
> +                       for locmirr in third_party_mirrors[mirror]:
> +                               m_uri = urlparse(cmirr)
> +                               uris.append(urlunparse((
> +                                       cmirr.scheme, cmirr.netloc, path) +
> +                                       parsed[3:]))
> +                       random.shuffle(uris)
> +                       for uri in uris:
> +                               yield ('third-party', uri)
> +       else:
> +               writemsg(_("Invalid mirror definition in SRC_URI:\n"),
> +                        noiselevel=-1)
> +               writemsg("  %s\n" % (uri), noiselevel=-1)


Is this a py3k thing? You should be able to write:

writemsg("  %s\n" % uri, noiselevel=-1)

The tuple is only needed for multiple arguments in py2k.



+
> +
> +def _get_uris(uris, settings, custom_mirrors=(), locations=()):
>

I want you to be careful here. This is bad style when you are constructing
mutable objects in parameter defaults:

>>> def func(a=[]):
...   a.append(5)
...
>>> del func
>>> def func(a=[]):
...   a.append(5)
...   print a
...
>>> func()
[5]
>>> func()
[5, 5]
>>> func()
[5, 5, 5]
>>> func()
[5, 5, 5, 5]
>>> func()
[5, 5, 5, 5, 5]

That doesn't function as most would expect, because it turns out that 'a'
is constructed at function definition time (not call time) and is re-used
between function calls.

Now of course, your code is constructing tuples, which are immutable. That
is handy.

I'm not sure if I appreciate that more or less than the other form....

def _get_uris(uris, settings, custom_mirrors=None, locations=None):
  if not custom_mirrors:
    custom_mirrors = ()
  if not locations:
    locations = ()

Another advisable way to write it is to simply not have default arguments,
and to force the caller to sync in an empty iterable:

def_get_uris(uris, settings, custom_mirrors, locations):
...

and so they will be forced to create the empty iterables at the call site.

A brief discussion with Sebastian on irc seemed to indicate that he thought
creating immutable objects in this way was permissible (we certainly do it
often for strings) so I won't raise a fuss. I merely want to point out that
you really need to be aware of whether the objects are mutable or immutable.


> +       third_party_mirrors = settings.thirdpartymirrors()
>

Why pass in all of settings? If you only need settings.thirdpartymirrors,
then just ask for 'a thirdparty mirrors iterable' or whatever it is. You do
not want the entire settings object, it is full of disgusting globals, and
it will only screw you over later.

Think about testing this function. Do you really want to try to construct
some sort of 'testing' settings object, or simply construct a list?


> +       third_party_mirror_uris = {}
> +       filedict = OrderedDict()
> +       primaryuri_dict = {}
> +       for filename, uri in _get_file_uri_tuples(uris=uris):
> +               if filename not in filedict:
> +                       filedict[filename] = [
> +                               os.path.join(location, 'distfiles',
> filename)
> +                               for location in locations]
> +               if uri is None:
> +                       continue
> +               if uri.startswith('mirror://'):
> +                       uris = _expand_mirror(
>

too many uri / uris variables...

can we called this 'expanded_uris'?

I'm really having trouble tracking what is what. Remember that 'uris' is
already a parameter to this function. Are you shadowing it here, or trying
to overwrite it?


> +                               uri=uri, custom_mirrors=custom_mirrors,
> +                               third_party_mirrors=third_party_mirrors)
> +                       filedict[filename].extend(uri for group, uri in
> uris)
>

group appears unused in your implicit iterable here.

perhaps:

filedict[filename].extend(uri for _, uri in uris)


> +                       third_party_mirror_uris.setdefault(filename,
> []).extend(
> +                               uri for group, uri in uris
> +                               if group == 'third-party')
>

I'm curious if this matters. We are iterator over 'uris' twice. Is it
cheaper to do it once and build the iterables once?

third_party_uris = []
for group, uri in uris:
  if group == 'third_party':
    third_party_uris.append(uri)
  filedict[filename].append(uri)

I'm guessing the iterable is so small it doesn't matter.



> +                       if not filedict[filename]:
> +                               writemsg(
> +                                       _("No known mirror by the name:
> %s\n")
> +                                       % (mirror,))
> +               else:
> +                       if restrict_fetch or force_mirror:
>

Are these globals or am I missing somethng?


> +                               # Only fetch from specific mirrors is
> allowed.
> +                               continue
> +                       primaryuris = primaryuri_dict.get(filename)
> +                       if primaryuris is None:
> +                               primaryuris = []
> +                               primaryuri_dict[filename] = primaryuris
> +                       primaryuris.append(uri)
> +
> +       # Order primaryuri_dict values to match that in SRC_URI.
> +       for uris in primaryuri_dict.values():
> +               uris.reverse()
>

Is there any guaranteed ordering for dict.values()? I thought dict order
was random (and they seriously make it random in modern python versions, to
detect bugs.) How does this uris.reverse() match the SRC_URI ordering?


> +
> +       # Prefer third_party_mirrors over normal mirrors in cases when
> +       # the file does not yet exist on the normal mirrors.
> +       for filename, uris in third_party_mirror_uris.items():
> +               primaryuri_dict.setdefault(filename, []).extend(uris)
> +
> +       # Now merge primaryuri values into filedict (includes mirrors
> +       # explicitly referenced in SRC_URI).
> +       if "primaryuri" in restrict:
>

same questoin here about where 'restrict' comes from.


> +               for filename, uris in filedict.items():
> +                       filedict[filename] = primaryuri_dict.get(filename,
> []) + uris
> +       else:
> +               for filename in filedict:
> +                       filedict[filename] +=
> primaryuri_dict.get(filename, [])
> +
> +       return filedict, primaryuri_dict
> +
> +
>  def fetch(myuris, mysettings, listonly=0, fetchonly=0,
>         locks_in_subdir=".locks", use_locks=1, try_mirrors=1, digests=None,
>         allow_missing_digests=True):
> @@ -328,7 +440,6 @@ def fetch(myuris, mysettings, listonly=0, fetchonly=0,
>         # couple of checksum failures, to increase the probablility
>         # of success before checksum_failure_max_tries is reached.
>         checksum_failure_primaryuri = 2
> -       thirdpartymirrors = mysettings.thirdpartymirrors()
>
>         # In the background parallel-fetch process, it's safe to skip
> checksum
>         # verification of pre-existing files in $DISTDIR that have the
> correct
> @@ -412,83 +523,9 @@ def fetch(myuris, mysettings, listonly=0, fetchonly=0,
>         else:
>                 locations = mymirrors
>
> -       file_uri_tuples = []
> -       # Check for 'items' attribute since OrderedDict is not a dict.
> -       if hasattr(myuris, 'items'):
> -               for myfile, uri_set in myuris.items():
> -                       for myuri in uri_set:
> -                               file_uri_tuples.append((myfile, myuri))
> -                       if not uri_set:
> -                               file_uri_tuples.append((myfile, None))
> -       else:
> -               for myuri in myuris:
> -                       if urlparse(myuri).scheme:
> -
> file_uri_tuples.append((os.path.basename(myuri), myuri))
> -                       else:
> -
> file_uri_tuples.append((os.path.basename(myuri), None))
> -
> -       filedict = OrderedDict()
> -       primaryuri_dict = {}
> -       thirdpartymirror_uris = {}
> -       for myfile, myuri in file_uri_tuples:
> -               if myfile not in filedict:
> -                       filedict[myfile]=[]
> -                       for y in range(0,len(locations)):
> -
> filedict[myfile].append(locations[y]+"/distfiles/"+myfile)
> -               if myuri is None:
> -                       continue
> -               if myuri[:9]=="mirror://":
> -                       eidx = myuri.find("/", 9)
> -                       if eidx != -1:
> -                               mirrorname = myuri[9:eidx]
> -                               path = myuri[eidx+1:]
> -
> -                               # Try user-defined mirrors first
> -                               if mirrorname in custommirrors:
> -                                       for cmirr in
> custommirrors[mirrorname]:
> -                                               filedict[myfile].append(
> -                                                       cmirr.rstrip("/")
> + "/" + path)
> -
> -                               # now try the official mirrors
> -                               if mirrorname in thirdpartymirrors:
> -                                       uris = [locmirr.rstrip("/") + "/"
> + path \
> -                                               for locmirr in
> thirdpartymirrors[mirrorname]]
> -                                       random.shuffle(uris)
> -                                       filedict[myfile].extend(uris)
> -
> thirdpartymirror_uris.setdefault(myfile, []).extend(uris)
> -
> -                               if not filedict[myfile]:
> -                                       writemsg(_("No known mirror by the
> name: %s\n") % (mirrorname))
> -                       else:
> -                               writemsg(_("Invalid mirror definition in
> SRC_URI:\n"), noiselevel=-1)
> -                               writemsg("  %s\n" % (myuri), noiselevel=-1)
> -               else:
> -                       if restrict_fetch or force_mirror:
> -                               # Only fetch from specific mirrors is
> allowed.
> -                               continue
> -                       primaryuris = primaryuri_dict.get(myfile)
> -                       if primaryuris is None:
> -                               primaryuris = []
> -                               primaryuri_dict[myfile] = primaryuris
> -                       primaryuris.append(myuri)
> -
> -       # Order primaryuri_dict values to match that in SRC_URI.
> -       for uris in primaryuri_dict.values():
> -               uris.reverse()
> -
> -       # Prefer thirdpartymirrors over normal mirrors in cases when
> -       # the file does not yet exist on the normal mirrors.
> -       for myfile, uris in thirdpartymirror_uris.items():
> -               primaryuri_dict.setdefault(myfile, []).extend(uris)
> -
> -       # Now merge primaryuri values into filedict (includes mirrors
> -       # explicitly referenced in SRC_URI).
> -       if "primaryuri" in restrict:
> -               for myfile, uris in filedict.items():
> -                       filedict[myfile] = primaryuri_dict.get(myfile, [])
> + uris
> -       else:
> -               for myfile in filedict:
> -                       filedict[myfile] += primaryuri_dict.get(myfile, [])
> +       filedict, primaryuri_dict = _get_uris(
> +               uris=myuris, settings=mysettings,
> +               custom_mirrors=custom_mirrors, locations=locations)
>
>         can_fetch=True
>
> --
> 1.8.5.2.8.g0f6c0d1
>
>
>

Re: [gentoo-portage-dev] [PATCH 3/3] pym/portage/package/ebuild/fetch.py: Factor out _get_uris

Reply via email to