Github user squito commented on a diff in the pull request:
https://github.com/apache/spark/pull/5463#discussion_r28278804
--- Diff: core/src/main/scala/org/apache/spark/storage/BlockManager.scala ---
@@ -755,104 +769,115 @@ private[spark] class BlockManager(
       case _ => null
     }
-    putBlockInfo.synchronized {
-      logTrace("Put for block %s took %s to get into synchronized block"
-        .format(blockId, Utils.getUsedTimeMs(startTimeMs)))
-
-      var marked = false
-      try {
-        // returnValues - Whether to return the values put
-        // blockStore - The type of storage to put these values into
-        val (returnValues, blockStore: BlockStore) = {
-          if (putLevel.useMemory) {
-            // Put it in memory first, even if it also has useDisk set to true;
-            // We will drop it to disk later if the memory store can't hold it.
-            (true, memoryStore)
-          } else if (putLevel.useOffHeap) {
-            // Use tachyon for off-heap storage
-            (false, tachyonStore)
-          } else if (putLevel.useDisk) {
-            // Don't get back the bytes from put unless we replicate them
-            (putLevel.replication > 1, diskStore)
-          } else {
-            assert(putLevel == StorageLevel.NONE)
-            throw new BlockException(
-              blockId, s"Attempted to put block $blockId without specifying storage level!")
+    try {
+      putBlockInfo.synchronized {
+        logTrace("Put for block %s took %s to get into synchronized block"
+          .format(blockId, Utils.getUsedTimeMs(startTimeMs)))
+
+        var marked = false
+        try {
+          // returnValues - Whether to return the values put
+          // blockStore - The type of storage to put these values into
+          val (returnValues, blockStore: BlockStore) = {
+            if (putLevel.useMemory) {
+              // Put it in memory first, even if it also has useDisk set to true;
+              // We will drop it to disk later if the memory store can't hold it.
+              (true, memoryStore)
+            } else if (putLevel.useOffHeap) {
+              // Use tachyon for off-heap storage
+              (false, tachyonStore)
+            } else if (putLevel.useDisk) {
+              // Don't get back the bytes from put unless we replicate them
+              (putLevel.replication > 1, diskStore)
+            } else {
+              assert(putLevel == StorageLevel.NONE)
+              throw new BlockException(
+                blockId, s"Attempted to put block $blockId without specifying storage level!")
+            }
           }
-        }
-        // Actually put the values
-        val result = data match {
-          case IteratorValues(iterator) =>
-            blockStore.putIterator(blockId, iterator, putLevel, returnValues)
-          case ArrayValues(array) =>
-            blockStore.putArray(blockId, array, putLevel, returnValues)
-          case ByteBufferValues(bytes) =>
-            bytes.rewind()
-            blockStore.putBytes(blockId, bytes, putLevel)
-        }
-        size = result.size
-        result.data match {
-          case Left (newIterator) if putLevel.useMemory => valuesAfterPut = newIterator
-          case Right (newBytes) => bytesAfterPut = newBytes
-          case _ =>
-        }
+          // Actually put the values
+          val result = data match {
+            case IteratorValues(iterator) =>
+              blockStore.putIterator(blockId, iterator, putLevel, returnValues)
+            case ArrayValues(array) =>
+              blockStore.putArray(blockId, array, putLevel, returnValues)
+            case ByteBufferValues(bytes) =>
+              bytes.rewind()
+              blockStore.putBytes(blockId, bytes, putLevel, resourceCleaner)
+          }
+          size = result.size
+          result.data match {
+            case Left(newIterator) if putLevel.useMemory => valuesAfterPut = newIterator
+            case Right(newBytes) => bytesAfterPut = newBytes
+            case _ =>
+          }
-        // Keep track of which blocks are dropped from memory
-        if (putLevel.useMemory) {
-          result.droppedBlocks.foreach { updatedBlocks += _ }
-        }
+          // Keep track of which blocks are dropped from memory
+          if (putLevel.useMemory) {
+            result.droppedBlocks.foreach {
+              updatedBlocks += _
+            }
+          }
-        val putBlockStatus = getCurrentBlockStatus(blockId, putBlockInfo)
-        if (putBlockStatus.storageLevel != StorageLevel.NONE) {
-          // Now that the block is in either the memory, tachyon, or disk store,
-          // let other threads read it, and tell the master about it.
-          marked = true
-          putBlockInfo.markReady(size)
-          if (tellMaster) {
-            reportBlockStatus(blockId, putBlockInfo, putBlockStatus)
+          val putBlockStatus = getCurrentBlockStatus(blockId, putBlockInfo)
+          if (putBlockStatus.storageLevel != StorageLevel.NONE) {
+            // Now that the block is in either the memory, tachyon, or disk store,
+            // let other threads read it, and tell the master about it.
+            marked = true
+            putBlockInfo.markReady(size)
+            if (tellMaster) {
+              reportBlockStatus(blockId, putBlockInfo, putBlockStatus)
+            }
+            updatedBlocks += ((blockId, putBlockStatus))
+          }
+        } finally {
+          // If we failed in putting the block to memory/disk, notify other possible readers
+          // that it has failed, and then remove it from the block info map.
+          if (!marked) {
+            // Note that the remove must happen before markFailure otherwise another thread
+            // could've inserted a new BlockInfo before we remove it.
+            blockInfo.remove(blockId)
+            putBlockInfo.markFailure()
+            logWarning(s"Putting block $blockId failed")
           }
-          updatedBlocks += ((blockId, putBlockStatus))
-        }
-      } finally {
-        // If we failed in putting the block to memory/disk, notify other possible readers
-        // that it has failed, and then remove it from the block info map.
-        if (!marked) {
-          // Note that the remove must happen before markFailure otherwise another thread
-          // could've inserted a new BlockInfo before we remove it.
-          blockInfo.remove(blockId)
-          putBlockInfo.markFailure()
-          logWarning(s"Putting block $blockId failed")
         }
       }
-    }
-    logDebug("Put block %s locally took %s".format(blockId, Utils.getUsedTimeMs(startTimeMs)))
+      logDebug("Put block %s locally took %s".format(blockId, Utils.getUsedTimeMs(startTimeMs)))
-    // Either we're storing bytes and we asynchronously started replication, or we're storing
-    // values and need to serialize and replicate them now:
-    if (putLevel.replication > 1) {
-      data match {
-        case ByteBufferValues(bytes) =>
-          if (replicationFuture != null) {
-            Await.ready(replicationFuture, Duration.Inf)
-          }
-        case _ =>
-          val remoteStartTime = System.currentTimeMillis
-          // Serialize the block if not already done
-          if (bytesAfterPut == null) {
-            if (valuesAfterPut == null) {
-              throw new SparkException(
-                "Underlying put returned neither an Iterator nor bytes! This shouldn't happen.")
+      // Either we're storing bytes and we asynchronously started replication, or we're storing
+      // values and need to serialize and replicate them now:
+      if (putLevel.replication > 1) {
+        data match {
+          case ByteBufferValues(bytes) =>
+            if (replicationFuture != null) {
+              Await.ready(replicationFuture, Duration.Inf)
             }
-            bytesAfterPut = dataSerialize(blockId, valuesAfterPut)
-          }
-          replicate(blockId, bytesAfterPut, putLevel)
-          logDebug("Put block %s remotely took %s"
-            .format(blockId, Utils.getUsedTimeMs(remoteStartTime)))
+          case _ =>
+            val remoteStartTime = System.currentTimeMillis
+            // Serialize the block if not already done
+            if (bytesAfterPut == null) {
+              if (valuesAfterPut == null) {
+                throw new SparkException(
+                  "Underlying put returned neither an Iterator nor bytes! This shouldn't happen.")
+              }
+              bytesAfterPut = dataSerialize(blockId, valuesAfterPut)
+            }
+            replicate(blockId, bytesAfterPut, putLevel)
+            logDebug("Put block %s remotely took %s"
+              .format(blockId, Utils.getUsedTimeMs(remoteStartTime)))
+        }
       }
-    }
-    BlockManager.dispose(bytesAfterPut)
+      BlockManager.dispose(bytesAfterPut)
+    } finally {
+      // this is to clean up the byte buffer from the *input* (as opposed to the line above, which
+      // disposes the byte buffer that is the *result* of the put). We might have turned that byte
+      // buffer into an iterator of values, in which case the input ByteBuffer should be disposed.
+      // It will automatically get disposed when we get to the end of the iterator, but in case
+      // there is some exception before we do, this will take care of it
+      resourceCleaner.doCleanup()
--- End diff --
This is probably the only case that is really unusual, and worth getting
somebody else to look at. A call to `BlockManager.doPut` *might* try to
deserialize a ByteBuffer it's given (if the storage level is in-memory
deserialized), and then it *might* read to the end of that iterator (if there
is enough memory to read to the end of the iterator, *or* if the storage level
allows dropping to disk). If both of those are true, the old code would
`dispose` the input byte buffer. If not, it never would.
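
To make the two conditions concrete, a rough sketch (illustrative only;
`enoughMemoryToUnroll` is a stand-in for the memory store's unroll check, not
a real variable here):

```scala
import org.apache.spark.storage.StorageLevel

// Sketch: the old code disposed the *input* ByteBuffer only when both of
// these held; on every other path the buffer was never disposed.
def oldCodeDisposedInput(putLevel: StorageLevel, enoughMemoryToUnroll: Boolean): Boolean = {
  val deserializedTheInput = putLevel.useMemory && putLevel.deserialized
  val readToEndOfIterator = enoughMemoryToUnroll || putLevel.useDisk
  deserializedTheInput && readToEndOfIterator
}
```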
So the issue here is not so much that we're worried about exceptions from
user code (since the iterator is never exposed to the user), but simply about
being consistent on whether or not we dispose the input byte buffer.
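
For reference, the pattern the new `finally` block relies on amounts to
something like this (a minimal sketch under my own names, not the PR's actual
`ResourceCleaner`):

```scala
import scala.collection.mutable.ArrayBuffer

// Minimal sketch: register dispose actions as resources are handed off, and
// run them exactly once in the outer finally. The input ByteBuffer is then
// disposed on every path -- iterator fully read, abandoned partway, or an
// exception thrown -- instead of only on the fully-consumed path.
class ResourceCleanerSketch {
  private val cleanups = ArrayBuffer.empty[() => Unit]

  def addCleanup(cleanup: () => Unit): Unit = cleanups += cleanup

  def doCleanup(): Unit = {
    cleanups.foreach(cleanup => cleanup())
    cleanups.clear()
  }
}
```

With that shape, whoever deserializes the input buffer registers its disposal
up front, and `doCleanup()` in the outer `finally` makes the disposal
unconditional.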