-
Notifications
You must be signed in to change notification settings - Fork 607
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improve performance of Files.walk on the JVM #3383
Changes from 1 commit
1f07787
e9bf35a
81eee78
18107f4
9a915eb
a5bd763
c7358a6
09b95f8
73be7b9
64987f7
6b171f5
375942e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,6 +24,7 @@ package io | |
package file | ||
|
||
import cats.effect.kernel.{Async, Resource, Sync} | ||
import cats.effect.std.Dispatcher | ||
import cats.syntax.all._ | ||
|
||
import java.nio.channels.{FileChannel, SeekableByteChannel} | ||
|
@@ -32,15 +33,16 @@ import java.nio.file.attribute.{ | |
BasicFileAttributeView, | ||
BasicFileAttributes => JBasicFileAttributes, | ||
PosixFileAttributes => JPosixFileAttributes, | ||
PosixFilePermissions | ||
PosixFilePermissions, | ||
FileTime | ||
} | ||
import java.security.Principal | ||
import java.util.stream.{Stream => JStream} | ||
|
||
import scala.concurrent.duration._ | ||
|
||
import fs2.concurrent.Channel | ||
import fs2.io.CollectionCompat._ | ||
import java.nio.file.attribute.FileTime | ||
|
||
private[file] trait FilesPlatform[F[_]] extends DeprecatedFilesApi[F] { self: Files[F] => | ||
|
||
|
@@ -389,6 +391,54 @@ private[file] trait FilesCompanionPlatform { | |
.resource(Resource.fromAutoCloseable(javaCollection)) | ||
.flatMap(ds => Stream.fromBlockingIterator[F](collectionIterator(ds), pathStreamChunkSize)) | ||
|
||
override def walk(start: Path, maxDepth: Int, followLinks: Boolean): Stream[F, Path] = | ||
Stream.resource(Dispatcher.sequential[F]).flatMap { dispatcher => | ||
Stream.eval(Channel.bounded[F, Chunk[Path]](10)).flatMap { channel => | ||
val doWalk = Sync[F].interruptibleMany { | ||
val bldr = Vector.newBuilder[Path] | ||
val limit = 4096 | ||
var size = 0 | ||
bldr.sizeHint(limit) | ||
JFiles.walkFileTree( | ||
start.toNioPath, | ||
if (followLinks) Set(FileVisitOption.FOLLOW_LINKS).asJava else Set.empty.asJava, | ||
maxDepth, | ||
new SimpleFileVisitor[JPath] { | ||
private def enqueue(path: JPath): FileVisitResult = { | ||
bldr += Path.fromNioPath(path) | ||
size += 1 | ||
if (size >= limit) { | ||
val result = dispatcher.unsafeRunSync(channel.send(Chunk.from(bldr.result()))) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I really wish there were a way to suspend the visitation and continue it later. That would allow us to avoid the Is There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Even eagerly collecting everything is only 5% faster than the channel based solution (using the 4096 limit): def walkEager(start: Path, maxDepth: Int, followLinks: Boolean): Stream[F, Path] = {
val doWalk = Sync[F].interruptibleMany {
val bldr = Vector.newBuilder[Path]
JFiles.walkFileTree(
start.toNioPath,
if (followLinks) Set(FileVisitOption.FOLLOW_LINKS).asJava else Set.empty.asJava,
maxDepth,
new SimpleFileVisitor[JPath] {
private def enqueue(path: JPath): FileVisitResult = {
bldr += Path.fromNioPath(path)
FileVisitResult.CONTINUE
}
override def visitFile(file: JPath, attrs: JBasicFileAttributes): FileVisitResult =
enqueue(file)
override def visitFileFailed(file: JPath, t: IOException): FileVisitResult =
FileVisitResult.CONTINUE
override def preVisitDirectory(dir: JPath, attrs: JBasicFileAttributes): FileVisitResult =
enqueue(dir)
override def postVisitDirectory(dir: JPath, t: IOException): FileVisitResult =
FileVisitResult.CONTINUE
}
)
Chunk.from(bldr.result())
}
Stream.eval(doWalk).flatMap(Stream.chunk)
} There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Wow, that's wild honestly. Have to ponder that. It's nice that we can just be lazy about our thread blocking though, since it simplifies this stuff. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @djspiewak BTW, there's a bunch of performance hackery in the JDK's file walking that's not (directly) available to us if we implement our own walk. For example, There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just to close this out, I tried this prototype: override def walk(
start: Path,
maxDepth: Int,
followLinks: Boolean,
chunkSize: Int
): Stream[F, Path] =
walkJustInTime(start, maxDepth, followLinks, chunkSize)
// if (chunkSize == Int.MaxValue) walkEager(start, maxDepth, followLinks)
// else walkLazy(start, maxDepth, followLinks, chunkSize)
private def walkJustInTime(
start: Path,
maxDepth: Int,
followLinks: Boolean,
chunkSize: Int
): Stream[F, Path] = {
def loop(acc: Vector[Path], toWalk: Vector[Path]): Stream[F, Path] = {
if (toWalk.isEmpty) {
Stream.chunk(Chunk.from(acc))
} else {
val path = toWalk.head
val (toEmit, newAcc) =
if (acc.size + 1 >= chunkSize)
(Chunk.from(acc :+ path), Vector.empty)
else (Chunk.empty, acc :+ path)
val list = Sync[F].interruptibleMany {
val npath = path.toNioPath
if (JFiles.isDirectory(npath)) {
val listing = JFiles.list(npath)
try listing.iterator.asScala.map(Path.fromNioPath).toVector
finally listing.close()
}
else Vector.empty
}
Stream.chunk(toEmit) ++ Stream.eval(list).flatMap(descendants => loop(newAcc, toWalk.drop(1) ++ descendants))
}
}
loop(Vector.empty, Vector(start))
} Using
Whereas the implementation in this PR results in:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here's a better prototype that does file attribute reading at the time of directory listing. asdf private def walkJustInTime(
start: Path,
maxDepth: Int,
followLinks: Boolean,
chunkSize: Int
): Stream[F, Path] = {
def loop(acc: Vector[Path], toWalk: Vector[(Path, JBasicFileAttributes)]): Stream[F, Path] = {
if (toWalk.isEmpty) {
Stream.chunk(Chunk.from(acc))
} else {
val (path, attr) = toWalk.head
val (toEmit, newAcc) =
if (acc.size + 1 >= chunkSize)
(Chunk.from(acc :+ path), Vector.empty)
else (Chunk.empty, acc :+ path)
if (attr.isDirectory) {
val list = Sync[F].interruptibleMany {
val listing = JFiles.list(path.toNioPath)
try listing.iterator.asScala.map(p =>
(Path.fromNioPath(p), JFiles.readAttributes(p, classOf[JBasicFileAttributes]))).toVector
finally listing.close()
}
Stream.chunk(toEmit) ++ Stream.eval(list).flatMap(descendants => loop(newAcc, toWalk.drop(1) ++ descendants))
} else Stream.chunk(toEmit) ++ loop(newAcc, toWalk.drop(1))
}
}
Stream.eval(Sync[F].interruptibleMany {
start -> JFiles.readAttributes(start.toNioPath, classOf[JBasicFileAttributes])
}).flatMap { s => loop(Vector.empty, Vector(s)) }
} Performs better but still doesn't beat the
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Alright, maybe we should switch to a version based on this: private def walkJustInTime(
start: Path,
maxDepth: Int,
followLinks: Boolean,
chunkSize: Int
): Stream[F, Path] = {
import scala.collection.immutable.Queue
def loop(toWalk0: Queue[(Path, JBasicFileAttributes)]): Stream[F, Path] = {
val partialWalk = Sync[F].interruptibleMany {
var acc = Vector.empty[Path]
var toWalk = toWalk0
while (acc.size < chunkSize && toWalk.nonEmpty) {
val (path, attr) = toWalk.head
toWalk = toWalk.drop(1)
acc = acc :+ path
if (attr.isDirectory) {
val listing = JFiles.list(path.toNioPath)
try {
val descendants = listing.iterator.asScala.map(p =>
(Path.fromNioPath(p), JFiles.readAttributes(p, classOf[JBasicFileAttributes]))).toVector
toWalk = toWalk ++ descendants
}
finally listing.close()
}
}
Stream.chunk(Chunk.from(acc)) ++ (if (toWalk.isEmpty) Stream.empty else loop(toWalk))
}
Stream.eval(partialWalk).flatten
}
Stream.eval(Sync[F].interruptibleMany {
start -> JFiles.readAttributes(start.toNioPath, classOf[JBasicFileAttributes])
}).flatMap(s => loop(Queue(s)))
}
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So basically what we're trying to figure out is whether it's worth eating 9% overhead to avoid blocking a thread which is already getting blocked by filesystem I/O? My guess is that it's not worth it but I shall ponder a bit. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Pushed a new version:
I'd like to add some tests for symbolic link following & max depth limits (we don't have any now). Then this PR should be good. |
||
bldr.clear() | ||
size = 0 | ||
if (result.isRight) FileVisitResult.CONTINUE else FileVisitResult.TERMINATE | ||
} else FileVisitResult.CONTINUE | ||
} | ||
|
||
override def visitFile(file: JPath, attrs: JBasicFileAttributes): FileVisitResult = | ||
enqueue(file) | ||
|
||
override def visitFileFailed(file: JPath, t: IOException): FileVisitResult = | ||
FileVisitResult.CONTINUE | ||
|
||
override def preVisitDirectory(dir: JPath, attrs: JBasicFileAttributes) | ||
: FileVisitResult = | ||
enqueue(dir) | ||
|
||
override def postVisitDirectory(dir: JPath, t: IOException): FileVisitResult = | ||
FileVisitResult.CONTINUE | ||
} | ||
) | ||
|
||
dispatcher.unsafeRunSync( | ||
if (size > 0) channel.closeWithElement(Chunk.from(bldr.result())) | ||
else channel.close | ||
) | ||
} | ||
channel.stream.unchunks.concurrently(Stream.eval(doWalk)) | ||
} | ||
} | ||
|
||
def createWatcher: Resource[F, Watcher[F]] = Watcher.default(this, F) | ||
|
||
def watch( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Btw @armanbilge one thing that occurs to me is that our fancy new unsafe queue thing isn't going to help very much if someone's using
Channel
.